# Process and grade LLM responses

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from typing import NamedTuple

In [3]:
class InputFiles(NamedTuple):
    responses: str
    taxonomy_scores: str
    taxonomy: str
    record_counts_by_phylum: str
    record_counts_by_family: str
    record_counts_by_genus: str
    record_counts_by_species: str
    record_counts_by_stateprovince: str
    record_counts_by_county: str

class OutputFiles(NamedTuple):
    results_with_ums: str

class Env(NamedTuple):
    input_files: InputFiles
    output_files: OutputFiles
    num_phrasings: int
    phrasings: list[str]
    query_fields: list[str]
    seed: int
    only_valid_absences: bool

if "snakemake" in globals():
    env = Env(
        input_files=snakemake.input,
        output_files=snakemake.output,
        num_phrasings=len(snakemake.params.phrasings),
        phrasings=snakemake.params.phrasings,
        query_fields=snakemake.params.query_fields,
        seed=snakemake.params.seed,
        only_valid_absences=snakemake.params.validate_absences
    )
else: # Fill in parameters manually for testing outside of snakemake
    import os
    ROOT = os.path.expanduser("~/biodiversity-llms")
    # LLM = "gpt-3.5-turbo-0125"
    LLM = "gpt-4-1106-preview"

    env = Env(
        input_files = InputFiles(
            f"../../results/idigbio-sample/{LLM}/occurrence/responses.tsv",
            f"../../results/idigbio-sample/{LLM}/taxonomy/summary.tsv",
            "../../results/idigbio-sample/input/taxa-genus.tsv",
            "../../results/idigbio-sample/input/record-counts-by-phylum.tsv",
            "../../results/idigbio-sample/input/record-counts-by-family.tsv",
            "../../results/idigbio-sample/input/record-counts-by-genus.tsv",
            "../../results/idigbio-sample/input/record-counts-by-species.tsv",
            "../../results/idigbio-sample/input/record-counts-by-stateprovince.tsv",
            "../../results/idigbio-sample/input/record-counts-by-county.tsv",
        ),
        output_files = OutputFiles(
            results_with_ums=f"../../results/idigbio-sample/{LLM}/results-with-ums.tsv"
        ),
        num_phrasings=6,
        phrasings=[
            "Can species {genus} {specificepithet} be found in {county}, {stateprovince}, {country}?",
            "Is it possible to encounter species {genus} {specificepithet} in {county}, {stateprovince}, {country}?",
            "Is there a presence of species {genus} {specificepithet} within {county}, {stateprovince}, {country}?",
            "Does {county}, {stateprovince}, {country} harbor species {genus} {specificepithet}?",
            "Is species {genus} {specificepithet} present in {county}, {stateprovince}, {country}?",
            "Can one observe species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
        ],
        query_fields=[
            "kingdom",
            "phylum",
            "family",
            "genus",
            "specificepithet",
            "country",
            "stateprovince",
            "county"
        ],
        seed=69847,
        only_valid_absences=True
    )

def nest(level, strings):
    separator = "\n" + "  " * level + "- "
    return separator + separator.join([str(s) for s in strings])

def quote(strings):
    return [f'"{s}"' for s in strings]

print("Job parameters:")
print(f"- Responses to analyze: {env.input_files.responses}")
print(f"- Query phrasings (count: {env.num_phrasings}):{nest(2, quote(env.phrasings))}")
print(f"- Query fields:{nest(2, quote(env.query_fields))}")


Job parameters:
- Responses to analyze: ../../results/idigbio-sample/gpt-4-1106-preview/occurrence/responses.tsv
- Query phrasings (count: 6):
    - "Can species {genus} {specificepithet} be found in {county}, {stateprovince}, {country}?"
    - "Is it possible to encounter species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
    - "Is there a presence of species {genus} {specificepithet} within {county}, {stateprovince}, {country}?"
    - "Does {county}, {stateprovince}, {country} harbor species {genus} {specificepithet}?"
    - "Is species {genus} {specificepithet} present in {county}, {stateprovince}, {country}?"
    - "Can one observe species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
- Query fields:
    - "kingdom"
    - "phylum"
    - "family"
    - "genus"
    - "specificepithet"
    - "country"
    - "stateprovince"
    - "county"


In [4]:
kpfg_class_order = pd.read_csv(open(env.input_files.taxonomy, "r"), sep="\t")\
    .set_index(["kingdom", "phylum", "family", "taxon"])[["class", "order"]]

In [5]:
def count_item(values, item):
    counts = dict(Counter(values).most_common())
    return counts[item] if item in counts else 0

def make_id(df):
    return df.apply(lambda r: hash("".join([str(v) for v in r.values]).lower()), axis=1)

def get_results(responses):
    df = pd.read_csv(open(responses), sep="\t")

    df["genus"] = df["genus"].str.lower()
    df = df[df["family"] != df["genus"]]

    df[["class", "order"]] = df.apply(axis=1, func=lambda r: kpfg_class_order.loc[r["kingdom"], r["phylum"], r["family"], r["genus"]].iloc[0])

    df["phrasing"] = df["question number"].astype(int) % env.num_phrasings
    df["query id"] = make_id(df[env.query_fields])

    df["response id"] = make_id(df[["query id", "phrasing"]])
    df = df.groupby("response id").head(1) # Drop responses for repeated questions

    df["scores"] = df["responses"].apply(lambda r: count_item(r.lower().split(), "yes"))
    df["yesnos"] = df["responses"].apply(lambda r: count_item(r.lower().split(), "yes") + count_item(r.lower().split(), "no"))
    df["abstains"] = 10 - df["yesnos"]

    df["prediction"] = df["scores"].apply(lambda x: -1 if x == 0 else 1)
    df["target"] = (df["present"] == "Yes").astype(int) * 2 - 1
    df["correct"] = df["prediction"] * df["target"]

    return df

UNUSED_FIELDS = ["query"]
res = get_results(env.input_files.responses)\
    .drop(columns=UNUSED_FIELDS)

print(f"{len(res) / env.num_phrasings:,.0f} records")
print(f"{len(res):,.0f} queries (#records x #phrasings)")
res.head(1)

  df[["class", "order"]] = df.apply(axis=1, func=lambda r: kpfg_class_order.loc[r["kingdom"], r["phylum"], r["family"], r["genus"]].iloc[0])


23,334 records
140,004 queries (#records x #phrasings)


Unnamed: 0,kingdom,phylum,family,genus,specificepithet,country,stateprovince,county,present,responses,...,order,phrasing,query id,response id,scores,yesnos,abstains,prediction,target,correct
7,animalia,porifera,microcionidae,clathria,aculeofila,Mexico,Nayarit,Bahia De Banderas,Yes,No No No No No No No No No No,...,poecilosclerida,1,5999046189242602981,-3707963387680935750,0,10,0,-1,1,-1


In [6]:
def find_optimal_decision_threshold():
    def decision(x):
        return x.astype(int) * 2 - 1

    scores = res["scores"]
    thresholds = np.arange(scores.min(), scores.max(), (scores.max() - scores.min()) / 1000)
    accuracies = np.array(list(map(lambda t: (decision(scores >= t) == res["target"]).mean(), thresholds)))
    best_accuracy_threshold = accuracies.argmax()
    print(f"Best accuracy threshold: between {thresholds[best_accuracy_threshold - 1]:.1g} - {thresholds[best_accuracy_threshold]:.1g}")
find_optimal_decision_threshold()

Best accuracy threshold: between 0 - 0.01


Query-level statistics

In [7]:
tax_data = pd.read_csv(open(env.input_files.taxonomy_scores, "r"), sep="\t")\
    .set_index(["subject_rank", "taxon"])

In [8]:
tax_scores = tax_data["rank exact match mean"]
tax_scores.head(3)

subject_rank  taxon         
class         actinopterygii    1.0
              amphibia          1.0
              angiospermae      1.0
Name: rank exact match mean, dtype: float64

In [9]:
tax_garbage_counts = tax_data["garbage responses"]
tax_garbage_counts.head(3)

subject_rank  taxon         
class         actinopterygii    0.0
              amphibia          0.0
              angiospermae      0.0
Name: garbage responses, dtype: float64

In [10]:
average_tax_garbage_counts = tax_garbage_counts.groupby("subject_rank").mean()
average_tax_garbage_counts.head(3)

subject_rank
class     0.196970
family    0.183433
genus     0.645909
Name: garbage responses, dtype: float64

In [11]:
average_tax_scores = tax_scores.groupby("subject_rank").mean()
average_tax_scores

subject_rank
class     0.875758
family    0.846596
genus     0.752374
order     0.844753
phylum    1.000000
Name: rank exact match mean, dtype: float64

In [12]:
def get_record_counts(path, fields):
    df = pd.read_csv(open(path, "r"), sep="\t")
    if "genus" in df.columns:
        df["genus"] = df["genus"].str.lower()
    return df.groupby(fields)["record count"].first()

record_counts_by_phylum = get_record_counts(env.input_files.record_counts_by_phylum, ["kingdom", "phylum"])
record_counts_by_family = get_record_counts(env.input_files.record_counts_by_family, ["kingdom", "phylum", "family"])
record_counts_by_genus = get_record_counts(env.input_files.record_counts_by_genus, ["kingdom", "phylum", "family", "genus"])
record_counts_by_species = get_record_counts(env.input_files.record_counts_by_species, ["kingdom", "phylum", "family", "genus", "specificepithet"])
record_counts_by_stateprovince = get_record_counts(env.input_files.record_counts_by_stateprovince, ["country", "stateprovince"])
record_counts_by_county = get_record_counts(env.input_files.record_counts_by_county, ["country", "stateprovince", "county"])

In [13]:
phrasing_avg_pred = res.groupby("query id")["prediction"].mean()
phrasing_var_score = res.groupby("query id")["scores"].var()

In [14]:
def show_accuracies(df, field, title, remap_values={}, format="{:,.2%}"):
    df = pd.DataFrame(
        (df.groupby(field)["correct"].mean() * .5 + .5)
        .rename("Response accuracy")
    ).transpose()
    df.columns.name = title
    df.rename(columns=remap_values, inplace=True)

    display(df.style.format(format))

show_accuracies(res, "phrasing", "Phrasing")
show_accuracies(res, "target", "Actual presence", remap_values={-1: "Absent", 1: "Present"})
show_accuracies(res, "prediction", "Predicted presence", remap_values={-1: "Absent", 1: "Present"})
show_accuracies(res, "kingdom", "Kingdom")

Phrasing,0,1,2,3,4,5
Response accuracy,64.28%,64.54%,64.55%,64.19%,64.10%,63.86%


Actual presence,Absent,Present
Response accuracy,86.41%,43.44%


Predicted presence,Absent,Present
Response accuracy,58.93%,77.30%


Kingdom,animalia,plantae
Response accuracy,63.87%,64.74%


In [15]:
import regex as re
def trim_county_name(county):
    parts = county.split()
    if len(parts) > 1 and re.sub(r'[^\w]', '', parts[-1]).lower() in ("co", "county", "mun", "par", "prov"):
        return " ".join(parts[:-1])
    else:
        return county

In [18]:
df = res[res["phrasing"] == 0]

def get_ums(instance):
    # TODO: condition um3 on prediction?
    num_responses = 10 - instance["abstains"]
    global r
    r = instance

    # Higher values = more certainty
    return pd.Series({
        # Scores
        "um1_total_score": instance["scores"] if instance["prediction"] == 1 else num_responses - instance["scores"],
        "um1_percent_score": (instance["scores"] if instance["prediction"] == 1 else num_responses - instance["scores"]) / max(1, num_responses),

        # Abstains
        "um2_abstains": num_responses,

        # Sensitivity to phrasing
        "um4_phrasing_agreement": phrasing_avg_pred[instance["query id"]] * instance["prediction"],
        "um4_phrasing_score_var": phrasing_var_score[instance["query id"]] * -1,

        # iDigBio record counts by taxonomic ranks
        # "um5_record_count_by_kingdom": taxon_record_counts["kingdomCount"],
        "um5_record_count_by_phylum": record_counts_by_phylum.loc[instance["kingdom"], instance["phylum"]],
        "um5_record_count_by_family": record_counts_by_family.loc[instance["kingdom"], instance["phylum"], instance["family"]],
        "um5_record_count_by_genus": record_counts_by_genus.loc[instance["kingdom"], instance["phylum"], instance["family"], instance["genus"]],
        "um5_record_count_by_species": record_counts_by_species.loc[instance["kingdom"], instance["phylum"], instance["family"], instance["genus"], instance["specificepithet"]],
        "um5_record_count_by_stateprovince": record_counts_by_stateprovince.loc[instance["country"], instance["stateprovince"]],
        "um5_record_count_by_county": record_counts_by_county.loc[instance["country"], instance["stateprovince"], trim_county_name(instance["county"])],

        # Accuracy on taxonomy questions
        "um6_taxqa_accuracy_by_phylum": tax_scores["phylum"].get(instance["phylum"], average_tax_scores["phylum"]),
        "um6_taxqa_accuracy_by_class": tax_scores["class"].get(instance["class"], average_tax_scores["class"]),
        "um6_taxqa_accuracy_by_order": tax_scores["order"].get(instance["order"], average_tax_scores["order"]),
        "um6_taxqa_accuracy_by_family": tax_scores["family"].get(instance["family"], average_tax_scores["family"]),
        "um6_taxqa_accuracy_by_genus": tax_scores["genus"].get(instance["genus"], average_tax_scores["genus"]),

        # Number of yes-no responses to taxonomy questions
        "um7_taxqa_responses_by_phylum": -tax_garbage_counts["phylum"].get(instance["phylum"], average_tax_garbage_counts["phylum"]) / 10,
        "um7_taxqa_responses_by_class": -tax_garbage_counts["class"].get(instance["class"], average_tax_garbage_counts["class"]) / 20,
        "um7_taxqa_responses_by_order": -tax_garbage_counts["order"].get(instance["order"], average_tax_garbage_counts["order"]) / 30,
        "um7_taxqa_responses_by_family": -tax_garbage_counts["family"].get(instance["family"], average_tax_garbage_counts["family"]) / 40,
        "um7_taxqa_responses_by_genus": -tax_garbage_counts["genus"].get(instance["genus"], average_tax_garbage_counts["genus"]) / 50,
    })

full_df = pd.concat([df.apply(lambda row: get_ums(row), axis=1), df], axis=1)
full_df.to_csv(env.output_files.results_with_ums, sep="\t")