# Process and grade LLM responses

In [1]:
import pandas as pd
from collections import Counter
from typing import NamedTuple
import numpy as np

In [2]:
class InputFiles(NamedTuple):
    """Paths of the tab-separated input files read by this notebook."""
    responses: str  # LLM responses to grade; loaded by get_results()
    taxonomy_scores: str  # table with "subject rank", "taxon", "num_correct" and "num_response" columns
    taxon_counts: str  # record counts, indexed by kingdom / phylum / family after loading
    train_test_split: str  # merged onto the responses using the query fields

class OutputFiles(NamedTuple):
    """Paths of the files written by this notebook."""
    results_with_ums: str  # destination of the final table (written with DataFrame.to_csv)

class Env(NamedTuple):
    """Job configuration: file locations plus the query parameters of the run."""
    input_files: InputFiles
    output_files: OutputFiles
    num_phrasings: int  # number of phrasing templates; "question number" is reduced modulo this value
    phrasings: list[str]  # question templates containing {field} placeholders
    query_fields: list[str]  # columns identifying a query (merge keys and input to the query id)
    seed: int  # NOTE(review): not read anywhere in the visible code
    only_valid_absences: bool  # NOTE(review): not read anywhere in the visible code

# Build the job configuration: from the injected `snakemake` object when one
# exists in the global namespace, otherwise from hard-coded defaults.
if "snakemake" in globals():
    env = Env(
        input_files=snakemake.input,
        output_files=snakemake.output,
        num_phrasings=len(snakemake.params.phrasings),
        phrasings=snakemake.params.phrasings,
        query_fields=snakemake.params.query_fields,
        seed=snakemake.params.seed,
        only_valid_absences=snakemake.params.validate_absences
    )
else: # Fill in parameters manually for testing outside of snakemake
    import os
    ROOT = os.path.expanduser("~/biodiversity-llms")

    env = Env(
        input_files = InputFiles(
            responses="../../results/llama2-7b-chat/all-shuffled.tsv",
            taxonomy_scores=ROOT + "/tdwg2023/taxonomy/results/kpfg_scores.tsv",
            taxon_counts=ROOT + "/tdwg2023/taxonomy/results/taxon-counts.tsv",
            train_test_split="../../results/input/train_test_split.tsv"
        ),
        output_files = OutputFiles(
            results_with_ums="../../results/llama2-7b-chat/results-with-ums.tsv"
        ),
        # Must stay equal to the number of entries in `phrasings` below
        num_phrasings=6,
        phrasings=[
            "Can species {genus} {specificepithet} be found in {county}, {stateprovince}, {country}?",
            "Is it possible to encounter species {genus} {specificepithet} in {county}, {stateprovince}, {country}?",
            "Is there a presence of species {genus} {specificepithet} within {county}, {stateprovince}, {country}?",
            "Does {county}, {stateprovince}, {country} harbor species {genus} {specificepithet}?",
            "Is species {genus} {specificepithet} present in {county}, {stateprovince}, {country}?",
            "Can one observe species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
        ],
        query_fields=[
            "kingdom",
            "phylum",
            "family",
            "genus",
            "specificepithet",
            "country",
            "stateprovince",
            "county"
        ],
        seed=69847,
        only_valid_absences=True
    )

def nest(level, strings):
    """Render ``strings`` as a bulleted list, one item per line.

    Every item is preceded by a newline, two spaces per ``level`` and "- ".
    The bullet prefix is emitted once even when ``strings`` is empty.
    """
    bullet = "\n" + level * "  " + "- "
    return bullet + bullet.join(map(str, strings))

def quote(strings):
    """Return a list with every item of ``strings`` wrapped in double quotes."""
    return ['"{}"'.format(item) for item in strings]

# Echo the job configuration so it is recorded next to the results
print(
    "Job parameters:",
    f"- Responses to analyze: {env.input_files.responses}",
    f"- Query phrasings (count: {env.num_phrasings}):{nest(2, quote(env.phrasings))}",
    f"- Query fields:{nest(2, quote(env.query_fields))}",
    sep="\n",
)


Job parameters:
- Responses to analyze: ../../results/llama2-7b-chat/all-shuffled.tsv
- Query phrasings (count: 6):
    - "Can species {genus} {specificepithet} be found in {county}, {stateprovince}, {country}?"
    - "Is it possible to encounter species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
    - "Is there a presence of species {genus} {specificepithet} within {county}, {stateprovince}, {country}?"
    - "Does {county}, {stateprovince}, {country} harbor species {genus} {specificepithet}?"
    - "Is species {genus} {specificepithet} present in {county}, {stateprovince}, {country}?"
    - "Can one observe species {genus} {specificepithet} in {county}, {stateprovince}, {country}?"
- Query fields:
    - "kingdom"
    - "phylum"
    - "family"
    - "genus"
    - "specificepithet"
    - "country"
    - "stateprovince"
    - "county"


In [3]:
def count_item(values, item):
    """Return how many times ``item`` occurs in ``values`` (0 when absent).

    ``Counter`` already yields 0 for a missing key, so no sorted copy of the
    counts is needed.
    """
    return Counter(values)[item]

def make_id(df):
    """Return one integer id per row of ``df``, derived from the row's values.

    The values of a row are converted with ``str``, joined, lower-cased and
    hashed. Rows whose values match case-insensitively get the same id.

    The built-in ``hash()`` of a string is salted per interpreter process, so
    it is not used here: ids would differ from run to run. A BLAKE2b digest
    keeps them reproducible. Values are joined with a separator character so
    that ("ab", "c") and ("a", "bc") do not collide.

    Returns:
        A signed 64-bit integer ``pd.Series`` sharing ``df``'s index (an empty
        Series for an empty frame).
    """
    import hashlib

    def stable_id(values):
        key = "\x1f".join(str(v) for v in values).lower()
        digest = hashlib.blake2b(key.encode("utf-8"), digest_size=8).digest()
        # Signed, so the result always fits in an int64 column
        return int.from_bytes(digest, "big", signed=True)

    ids = [stable_id(row) for row in df.itertuples(index=False, name=None)]
    return pd.Series(ids, index=df.index, dtype="int64")

def get_results(responses, train_test_split):
    """Load the LLM responses and grade them.

    Args:
        responses: Path of the tab-separated responses file.
        train_test_split: Path of the tab-separated train/test split file; it
            is merged onto the responses using ``env.query_fields``.

    Returns:
        A DataFrame with one row per (query, phrasing) and these added
        columns: "phrasing", "query id", "response id", "no score",
        "yes score", "no softmax", "yes softmax", "prediction" (+1 / -1) and
        "correct" (prediction * target).
    """
    from scipy.special import softmax

    # Hand the paths to pandas so the files are opened *and closed* by
    # read_csv, not left as dangling handles from open()
    split = pd.read_csv(train_test_split, sep="\t", index_col=0)
    df = pd.read_csv(responses, sep="\t").merge(split, on=env.query_fields)

    df["phrasing"] = df["question number"].astype(int) % env.num_phrasings
    df["query id"] = make_id(df[env.query_fields])

    df["response id"] = make_id(df[["query id", "phrasing"]])
    # Drop responses for repeated questions; copy so the column assignments
    # below act on an independent frame (no SettingWithCopyWarning)
    df = df.groupby("response id").head(1).copy()

    # NOTE(security): eval() executes whatever text is stored in these columns.
    # Only run this on response files you produced yourself; if the columns
    # always hold plain list literals, ast.literal_eval is the safer parser.
    df["first token top scores"] = df["first token top scores"].map(lambda x: eval(x))
    df["first token top strings"] = df["first token top strings"].map(lambda x: [s.lower() for s in eval(x)])

    token_lists = df["first token top strings"]
    score_lists = df["first token top scores"]
    # Softmax over the listed top-token scores, computed once per row
    prob_lists = [softmax(np.array(scores).astype(float)) for scores in score_lists]

    def token_value(value_lists, token, default):
        # If a token occurs more than once after lower-casing, the last
        # occurrence wins (dict construction semantics).
        return pd.Series(
            [
                dict(zip(tokens, values)).get(token, default)
                for tokens, values in zip(token_lists, value_lists)
            ],
            index=df.index,
            dtype=np.float32,
        )

    df["no score"] = token_value(score_lists, "no", -np.inf)
    df["yes score"] = token_value(score_lists, "yes", -np.inf)
    df["no softmax"] = token_value(prob_lists, "no", 0)
    df["yes softmax"] = token_value(prob_lists, "yes", 0)

    df["prediction"] = (df["yes softmax"] > 0).astype(int) * 2 - 1 # Note that all response started with either "Yes" or "No"

    df["correct"] = df["prediction"] * df["target"]

    return df

# Columns dropped right after loading
UNUSED_FIELDS = ["query"]
res = (
    get_results(env.input_files.responses, env.input_files.train_test_split)
    .drop(columns=UNUSED_FIELDS)
)

# Each record is asked once per phrasing, hence the division
print(f"{len(res) / env.num_phrasings:,.0f} records")
print(f"{len(res):,.0f} queries (#records x #phrasings)")
res.head(1)

23,505 records
141,030 queries (#records x #phrasings)


Unnamed: 0,kingdom,phylum,family,genus,specificepithet,country,stateprovince,county,present,response,...,train,phrasing,query id,response id,no score,yes score,no softmax,yes softmax,prediction,correct
0,animalia,porifera,microcionidae,Clathria,aculeofila,Mexico,Nayarit,Bahia De Banderas,Yes,'Yes.',...,False,2,3820525251850966572,8589985240222852093,-inf,33.958,0.0,1.0,1,1


In [4]:
def find_optimal_decision_threshold(results=None):
    """Print the "yes softmax" threshold interval that maximizes accuracy.

    A grid of thresholds in [0, 0.01) with step 5e-05 is scanned. For each
    threshold ``t`` a response is predicted as present (+1) when its
    "yes softmax" is >= ``t`` and as absent (-1) otherwise; accuracy is the
    share of predictions equal to "target".

    Args:
        results: DataFrame with "yes softmax" and "target" columns. Defaults
            to the module-level ``res``.
    """
    if results is None:
        results = res

    def decision(x):
        return x.astype(int) * 2 - 1

    thresholds = np.arange(0, .01, .00005)
    accuracies = np.array([
        (decision(results["yes softmax"] >= t) == results["target"]).mean()
        for t in thresholds
    ])
    best = int(accuracies.argmax())
    # Clamp the lower bound: when the first grid point is already the best,
    # index -1 would wrap around to the *largest* threshold.
    lower = thresholds[max(best - 1, 0)]
    print(f"Best accuracy threshold: between {lower:.1g} - {thresholds[best]:.1g}")
# Report the best "yes softmax" decision threshold for the responses in `res`
find_optimal_decision_threshold()

Best accuracy threshold: between 0 - 5e-05


Query-level statistics

In [5]:
# Taxonomy-question scores per (subject rank, taxon). Paths are handed to
# pandas directly so read_csv opens and closes the files itself (the earlier
# open(...) handles were never closed).
kpfg_scores = pd.read_csv(env.input_files.taxonomy_scores, sep="\t")
# Smoothed accuracy: one pseudo-correct and one pseudo-incorrect response
kpfg_scores["accuracy"] = (1 + kpfg_scores["num_correct"]) / (2 + kpfg_scores["num_response"])
kpfg_scores = kpfg_scores.set_index(["subject rank", "taxon"])
# Pooled (unsmoothed) accuracy per subject rank. Only the two needed columns
# are summed, and the division is vectorised.
average_kpfg_scores = (
    kpfg_scores.groupby("subject rank")[["num_correct", "num_response"]].sum()
    .pipe(lambda totals: totals["num_correct"] / totals["num_response"])
)
average_kpfg_responses = kpfg_scores.groupby("subject rank")["num_response"].mean()

record_counts_by_taxon = pd.read_csv(env.input_files.taxon_counts, sep="\t").set_index(["kingdom", "phylum", "family"])

In [6]:
# Mean prediction (+1 / -1) over the phrasings of each query
phrasing_avg_pred = res.groupby("query id")["prediction"].mean()
# Per-query variance of the softmax values across phrasings.
# Can't compute variance of unnormalized scores when they include -infinity
phrasing_var_score_no, phrasing_var_score_yes = (
    res.groupby("query id")[column].var()
    for column in ("no softmax", "yes softmax")
)
phrasing_var_score = phrasing_var_score_no + phrasing_var_score_yes # TODO: just trying things out!

In [7]:
def show_accuracies(df, field, title, remap_values={}, format="{:,.2%}"):
    """Display a one-row table of response accuracy per value of ``field``.

    Args:
        df: Graded responses; must contain ``field`` and a "correct" column.
        field: Column to group by.
        title: Caption used as the name of the table's column axis.
        remap_values: Optional mapping from raw group values to display labels.
        format: Format string applied to the cells.
    """
    # mean * .5 + .5 maps a mean over +1 / -1 values onto [0, 1]
    # (assumes "correct" holds +1 / -1 - TODO confirm)
    accuracy = df.groupby(field)["correct"].mean() * .5 + .5
    table = accuracy.rename("Response accuracy").to_frame().transpose()
    table.columns.name = title
    table.rename(columns=remap_values, inplace=True)

    display(table.style.format(format))

# Accuracy broken down by phrasing, true label, predicted label and kingdom
show_accuracies(res, field="phrasing", title="Phrasing")
show_accuracies(res, field="target", title="Actual presence", remap_values={-1: "Absent", 1: "Present"})
show_accuracies(res, field="prediction", title="Predicted presence", remap_values={-1: "Absent", 1: "Present"})
show_accuracies(res, field="kingdom", title="Kingdom")

Phrasing,0,1,2,3,4,5
Response accuracy,54.56%,54.34%,55.09%,54.81%,54.94%,54.86%


Actual presence,Absent,Present
Response accuracy,70.92%,39.61%


Predicted presence,Absent,Present
Response accuracy,52.43%,59.21%


Kingdom,animalia,plantae
Response accuracy,52.53%,57.63%


In [None]:
def get_acc_by_field(d, field, prior_counts=1) -> pd.Series:
    """Smoothed accuracy on the training rows for every value of ``field``.

    Only rows whose "train" flag is set are counted. For each value that
    occurs anywhere in ``d[field]`` the result is
    ``(prior + #correct) / (2 * prior + #correct + #incorrect)``, so a value
    with no training rows gets 0.5 under the default prior.

    Args:
        d: Graded responses with ``field``, "train" and "correct" columns.
        field: Column whose values index the result.
        prior_counts: Pseudo-count added to both the correct and incorrect tallies.
    """
    categories = d[field].unique()
    in_train = d["train"]

    def smoothed_count(outcome):
        selected = d[in_train * (d["correct"] == outcome)]
        tally = selected.groupby(field).size()
        # Values without any matching training row count as zero
        return prior_counts + tally.reindex(categories, fill_value=0)

    hits = smoothed_count(1)
    misses = smoothed_count(-1)

    return hits / (hits + misses)

# Keep a single row per query: the one asked with phrasing 0
df = res.loc[res["phrasing"] == 0]

# Smoothed training-set accuracy per taxonomic rank value and per location
acc_by_rank = pd.Series({
    "kingdom": get_acc_by_field(df, "kingdom"),
    "phylum": get_acc_by_field(df, "phylum"),
    "family": get_acc_by_field(df, "family"),
})
acc_by_country = get_acc_by_field(df, field="country")
acc_by_stateprovince = get_acc_by_field(df, field="stateprovince")

def get_ums(instance):
    """Compute the uncertainty measures (the "um*" values) for one graded response.

    Args:
        instance: One row of the graded responses, including the query fields,
            "query id", "prediction" and the yes/no score and softmax columns.

    Returns:
        A ``pd.Series`` keyed by measure name. It reads the module-level
        lookup tables (accuracies by field, phrasing statistics, record counts
        and taxonomy-question scores).
    """
    # TODO: condition um3 on prediction?
    counts = record_counts_by_taxon.loc[instance["kingdom"], instance["phylum"], instance["family"]]
    predicted_present = instance["prediction"] == 1
    predicted_absent = instance["prediction"] == -1

    def taxqa(column, rank, fallback):
        # Per-taxon value, or the rank-level fallback for taxa not in the table
        return kpfg_scores[column][rank].get(instance[rank].lower(), fallback[rank])

    # Positively oriented, i.e. higher values = more certainty
    measures = {
        # Scores
        "um1_prediction_score": instance["yes score"] if predicted_present else instance["no score"],
        "um1_prediction_softmax": instance["yes softmax"] if predicted_present else instance["no softmax"],
        "um1_present_score": max(0, instance["yes score"]) if predicted_present else 0,
        "um1_absent_score": max(0, instance["no score"]) if predicted_absent else 0,
        "um1_present_softmax": instance["yes softmax"] if predicted_present else 0,
        "um1_absent_softmax": instance["no softmax"] if predicted_absent else 0,

        # Abstains

        # Performance by field values
        # Note: defaults to 0.5 if no data (TODO)
        "um3_accuracy_by_kingdom": acc_by_rank["kingdom"][instance["kingdom"]],
        "um3_accuracy_by_phylum": acc_by_rank["phylum"][instance["phylum"]],
        "um3_accuracy_by_family": acc_by_rank["family"][instance["family"]],
        "um3_accuracy_by_country": acc_by_country[instance["country"]],
        "um3_accuracy_by_stateprovince": acc_by_stateprovince[instance["stateprovince"]],

        # Sensitivity to phrasing
        # NOTE(review): because of the leading minus, full agreement of all
        # phrasings with this prediction yields -1, which seems at odds with
        # "higher values = more certainty" - confirm the intended sign.
        "um4_phrasing_agreement": phrasing_avg_pred[instance["query id"]] * -instance["prediction"],
        "um4_phrasing_score_var": phrasing_var_score[instance["query id"]],

        # iDigBio record counts by taxonomic ranks
        "um5_record_count_by_kingdom": counts["kingdomCount"],
        "um5_record_count_by_phylum": counts["phylumCount"],
        "um5_record_count_by_family": counts["familyCount"],
        # "um5_record_count_by_country": country_record_count,
        # "um5_record_count_by_stateprovince": stateprovince_record_count,

        # Accuracy on taxonomy questions
        "um6_taxqa_accuracy_by_phylum": taxqa("accuracy", "phylum", average_kpfg_scores),
        "um6_taxqa_accuracy_by_family": taxqa("accuracy", "family", average_kpfg_scores),
        "um6_taxqa_accuracy_by_genus": taxqa("accuracy", "genus", average_kpfg_scores),

        # Number of yes-no responses to taxonomy questions
        "um7_taxqa_responses_by_phylum": taxqa("num_response", "phylum", average_kpfg_responses) / 10,
        "um7_taxqa_responses_by_family": taxqa("num_response", "family", average_kpfg_responses) / 40,
        "um7_taxqa_responses_by_genus": taxqa("num_response", "genus", average_kpfg_responses) / 50,
    }
    return pd.Series(measures)

# Uncertainty-measure columns first, followed by the original response columns
full_df = pd.concat([df.apply(get_ums, axis=1), df], axis=1)
# NOTE(review): to_csv writes comma-separated text by default, while the inputs
# above are read with sep="\t" - confirm which separator consumers of this file expect.
full_df.to_csv(env.output_files.results_with_ums)