In [1]:
import pandas as pd
import numpy as np
import textdistance as td
from typing import NamedTuple

In [7]:
class InputFiles(NamedTuple):
    """Paths of the files this notebook reads."""
    # Path of the responses TSV (loaded with pd.read_csv(..., sep="\t") further down).
    responses: str
    # Path of the alignments TSV (loaded with the first column as the index).
    alignments: str
    # Path of the bad-names TSV; in this notebook it is only referenced from commented-out code.
    bad_names: str

class OutputFiles(NamedTuple):
    """Paths of the files this notebook writes."""
    # Destination of the per-taxon summary table (written tab-separated by the last cell).
    summary: str

class Env(NamedTuple):
    """Bundle of the input and output locations used by the notebook."""
    # Where to read from; accessed by attribute (e.g. `.responses`, `.alignments`).
    input_files: InputFiles
    # Where to write to; accessed by attribute (`.summary`).
    output_files: OutputFiles

# Resolve the job parameters: use the `snakemake` object when one exists in the
# global namespace, otherwise fall back to hand-written paths for interactive use.
if "snakemake" in globals():
    # snakemake.input / snakemake.output are passed through as-is (not converted to
    # InputFiles / OutputFiles), so they must expose the attribute names used below
    # (`responses`, `alignments`, `summary`).
    env = Env(
        input_files=snakemake.input,
        output_files=snakemake.output
    )
else: # Fill in parameters manually for testing outside of snakemake
    import os
    # NOTE(review): ROOT is not used by any code visible in this notebook; the paths
    # below are relative to the working directory instead.
    ROOT = os.path.expanduser("~/biodiversity-llms")
    # Model whose results directory is analysed; swap the commented line to switch models.
    # LLM = "gpt-3.5-turbo-0125"
    LLM = "gpt-4-1106-preview"

    env = Env(
        input_files = InputFiles(
            responses=f"../results/us-maps/{LLM}/taxonomy/responses.tsv",
            alignments="../results/us-maps/input/alignments.tsv",
            bad_names="../results/us-maps/input/bad-names.tsv"
        ),
        output_files = OutputFiles(
            summary=f"../results/us-maps/{LLM}/taxonomy/summary.tsv"
        )
    )

def nest(level, strings):
    """Render `strings` as an indented bullet list.

    Every item is converted with `str` and preceded by a newline, two spaces
    per `level`, and "- ". An empty `strings` yields a single empty bullet.
    """
    bullet = "\n{}- ".format("  " * level)
    body = bullet.join(map(str, strings))
    return bullet + body

def quote(strings):
    """Return a list with each item of `strings` wrapped in double quotes."""
    wrap = '"{}"'.format
    return list(map(wrap, strings))

# Echo the resolved configuration so the paths used for this run appear in the cell output.
print("Job parameters:")
print(env)

Job parameters:
Env(input_files=InputFiles(responses='../results/us-maps/gpt-4-1106-preview/taxonomy/responses.tsv', alignments='../results/us-maps/input/alignments.tsv', bad_names='../results/us-maps/input/bad-names.tsv'), output_files=OutputFiles(summary='../results/us-maps/gpt-4-1106-preview/taxonomy_performance.tsv'))


In [3]:
# Display the resolved input paths (the cell's last expression is rendered as its output).
env.input_files

InputFiles(responses='../results/us-maps/gpt-3.5-turbo-0125/taxonomy/responses.tsv', alignments='../results/us-maps/input/alignments.tsv', bad_names='../results/us-maps/input/bad-names.tsv')

In [4]:
# Iterating an InputFiles tuple yields its field values, so this prints one path per line.
for k in env.input_files:
    print(k)

../results/us-maps/gpt-3.5-turbo-0125/taxonomy/responses.tsv
../results/us-maps/input/alignments.tsv
../results/us-maps/input/bad-names.tsv


In [8]:
# Words that mark a response as a non-answer. is_garbage_response compares them with
# the cleaned response (lower-cased first word, letters only), which is why the
# apostrophe-free spelling "theres" is listed.
GARBAGE_WORDS = {
    "the",
    "only",
    "that",
    "this",
    "sorry",
    "there",
    "theres",
    "apologies",
}

# bad_names = pd.read_csv(env.input_files.bad_names, sep="\t", dtype=str)\
#     .set_index("providedName").index.str.lower().union([""])

def clean_response(response: str, object_rank: str):
    """Reduce a free-text response to a single lower-case, letters-only word.

    The first whitespace-separated word is used, unless it ends with ":" or
    equals `object_rank` (e.g. "Phylum: Mollusca"), in which case the second
    word is used when there is one.

    Args:
        response: Raw response text; it is coerced with `str` before processing.
        object_rank: Rank name that may prefix the answer (compared against the
            lower-cased first word).

    Returns:
        The selected word with every non-alphabetic character removed, or ""
        when the response contains no words at all.
    """
    # Coerce first, then lower-case, so the two steps agree on the input type.
    words = str(response).lower().split()
    if not words:
        # Empty / whitespace-only response: nothing to extract.
        return ""

    word = words[0]
    # For responses like "Phylum: Mollusca", use the second word instead -- but only
    # if it exists, so a bare "phylum:" no longer raises IndexError.
    if (word.endswith(":") or word == object_rank) and len(words) > 1:
        word = words[1]

    # Filter out non-letters
    return "".join(c for c in word if c.isalpha())

def clean_response_list(responses: pd.Series, subject: str, object_rank: str):
    """Clean every response and drop the ones classified as garbage.

    Args:
        responses: Iterable of raw response strings.
        subject: Name of the taxon being asked about; a response that merely
            repeats it is treated as garbage by is_garbage_response.
        object_rank: Rank being asked for, forwarded to the cleaning helpers.

    Returns:
        A list with the cleaned form of each non-garbage response, in input order.
    """
    # Judge the RAW response (as mark_garbage_responses does) and clean it once for
    # the result. The previous version passed the already-cleaned word back through
    # is_garbage_response, which cleans again and raised IndexError whenever the
    # cleaned word was empty or equal to `object_rank`.
    return [
        clean_response(raw, object_rank)
        for raw in responses
        if not is_garbage_response(raw, subject, object_rank)
    ]

def is_garbage_response(response, subject, object_rank):
    """Tell whether `response` carries no usable answer.

    The response is cleaned with clean_response first; it is garbage when the
    cleaned word repeats `subject`, has at most two characters, or is listed
    in GARBAGE_WORDS.
    """
    cleaned = clean_response(response, object_rank)
    if cleaned == subject:
        return True
    if len(cleaned) <= 2:
        return True
    return cleaned in GARBAGE_WORDS

def mark_garbage_responses(responses, subject, object_rank):
    """Lazily map each response to its is_garbage_response flag.

    Returns a `map` iterator of booleans, one per item of `responses`.
    """
    def _flag(candidate):
        return is_garbage_response(candidate, subject, object_rank)

    return map(_flag, responses)

def clean_taxa(name):
    """Concatenate the purely alphabetic whitespace-separated words of `name`.

    A word containing any non-letter character is dropped as a whole, and the
    remaining words are joined without a separator.

    NOTE(review): clean_response filters character by character, whereas this
    filters word by word -- confirm that the word-level behaviour is intended.
    """
    return "".join(word for word in name.split() if word.isalpha())
    
# Per-column converters applied by pd.read_csv while parsing the responses file.
# NOTE(security): `eval` executes whatever expression a "responses" cell contains,
# so only trusted files may be loaded this way.
# TODO(review): consider ast.literal_eval if the cells only ever hold literals.
conv = {
    "taxon": clean_taxa,
    "responses": eval
}

responses = pd.read_csv(env.input_files.responses, sep="\t", converters=conv)
# Keep only rows whose kingdom is animalia or plantae.
responses = responses[responses["kingdom"].isin(("animalia", "plantae"))]
# Drop bookkeeping columns that are not used below.
responses = responses.drop(columns=["query", "question number", "input token count", "output token count"])
# responses = responses[~responses["taxon"].isin(bad_names)]

# Order matters: count the garbage answers on the RAW response lists first, then
# overwrite "responses" with the cleaned, garbage-free lists.
responses["garbage responses"] = responses.apply(axis=1, func=lambda x: sum(mark_garbage_responses(x["responses"], x["taxon"], x["object_rank"])))
responses["responses"] = responses.apply(axis=1, func=lambda x: clean_response_list(x["responses"], x["taxon"], x["object_rank"]))

# `scores` starts as a copy so the score columns added later do not touch `responses`.
scores = responses.copy()
responses.head(3)

Unnamed: 0,subject_rank,object_rank,taxon,kingdom,phylum,class,order,family,genus,Unnamed: 9,responses,garbage responses
0,phylum,kingdom,tracheophyta,plantae,,,,,,,"[plantae, plantae, plantae, plantae, plantae, ...",0
1,class,kingdom,magnoliopsida,plantae,tracheophyta,,,,,,"[plantae, plantae, plantae, plantae, plantae, ...",0
2,class,phylum,magnoliopsida,plantae,tracheophyta,,,,,,"[magnoliophyta, magnoliophyta, magnoliophyta, ...",0


### Score against Nomer alignment

In [10]:
# Nomer alignment
# Nomer alignment
# NOTE: this rebinds the `conv` name used by the responses-loading cell.
# NOTE(security): `eval` executes whatever expression the "aliases" / "classification"
# cells contain, so only trusted files may be loaded this way.
# TODO(review): consider ast.literal_eval if the cells only ever hold literals.
conv = {
    "aliases": eval,
    "classification": eval
}

# The first column becomes the index; the scoring functions look taxa up in it.
alignment = pd.read_csv(env.input_files.alignments, sep="\t", index_col=0, converters=conv)
alignment.head(3)

Unnamed: 0_level_0,aliases,classification
name,Unnamed: 1_level_1,Unnamed: 2_level_1
acer,{acer},"{(infrakingdom, streptophyta), (, cellular), (..."
amorpha,"{amorpha, laothoe}","{(clade, protostomia), (, pentapetalae), (king..."
aves,{aves},"{(clade, teleostomi), (clade, sauropsida), (su..."


In [14]:
def score_partial_match(response, name):
    """Return the Jaro-Winkler similarity of `response` and `name`.

    Computed by textdistance with prefix_weight=0.2.
    """
    similarity = td.jaro_winkler(response, name, prefix_weight=0.2)
    return similarity

def score_best_partial_match_any_rank(response, full_classification):
    """Best partial-match score of `response` against any name in a classification.

    `full_classification` is iterated as (rank, name) pairs; the rank is ignored.
    Returns 1 as soon as a name scores at least 1, otherwise the highest score
    seen (0 when nothing scores above 0 or the classification is empty).
    """
    highest = 0
    for _rank, candidate in full_classification:
        current = score_partial_match(response, candidate)
        if current <= highest:
            continue
        if current >= 1:
            # Perfect match: no other name can do better.
            return 1
        highest = current
    return highest

def nomer_partial_match_scores(row):
    """Summarise the partial-match scores of one row's responses.

    Args:
        row: Mapping-like row providing "taxon" and "responses" (an iterable of
            cleaned response strings).

    Returns:
        pd.Series with "partial match best", "partial match mean" and
        "partial match var". All three are NaN when the taxon is missing from
        `alignment.index` or the row has no responses to score.
    """
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[taxon]["classification"]
        # Named `match_scores` so the module-level `scores` frame is not shadowed.
        match_scores = [
            score_best_partial_match_any_rank(response, classification)
            for response in row["responses"]
        ]
    else:
        match_scores = []

    # Guard every statistic, not just the maximum: np.mean / np.var on an empty
    # list also give NaN, but only after emitting RuntimeWarnings.
    has_scores = len(match_scores) > 0
    return pd.Series({
        "partial match best": np.max(match_scores) if has_scores else np.nan,
        "partial match mean": np.mean(match_scores) if has_scores else np.nan,
        "partial match var": np.var(match_scores) if has_scores else np.nan,
    })

# Quick manual check of the scorer on the first row of `responses`.
row = responses.iloc[0]
nomer_partial_match_scores(row)

partial match best    1.0
partial match mean    1.0
partial match var     0.0
dtype: float64

In [15]:
def nomer_exact_match(subject, rank, response, classification):
    """Score whether `response` is the name listed at `rank` in `classification`.

    Returns NaN when `subject` is absent from `alignment.index`; otherwise 1.0 if
    some (rank, name) entry has the requested rank and a name equal to
    `response`, else 0.0.
    """
    if subject not in alignment.index:
        return np.nan
    for entry in classification:
        if entry[0] == rank and entry[1] == response:
            return 1.0
    return 0.0

def nomer_exact_match_at_any_rank(subject, response, classification):
    """Score whether `response` equals a name at any rank of `classification`.

    Returns NaN when `subject` is absent from `alignment.index`; otherwise 1.0 if
    any entry's name (its second element) equals `response`, else 0.0.
    """
    if subject not in alignment.index:
        return np.nan
    for entry in classification:
        if entry[1] == response:
            return 1.0
    return 0.0

def nomer_rank_exact_match_scores(row):
    """Summarise the rank-specific exact-match scores of one row's responses.

    Args:
        row: Mapping-like row providing "taxon", "object_rank" and "responses"
            (an iterable of cleaned response strings).

    Returns:
        pd.Series with "rank exact match best" and "rank exact match mean".
        Both are NaN when the taxon is missing from `alignment.index` or the
        row has no responses to score.
    """
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[taxon]["classification"]
        object_rank = row["object_rank"]
        # Named `match_scores` so the module-level `scores` frame is not shadowed.
        match_scores = [
            nomer_exact_match(taxon, object_rank, response, classification)
            for response in row["responses"]
        ]
    else:
        match_scores = []

    # Guard the mean as well as the maximum: np.mean on an empty list also gives
    # NaN, but only after emitting a RuntimeWarning.
    has_scores = len(match_scores) > 0
    return pd.Series({
        "rank exact match best": np.max(match_scores) if has_scores else np.nan,
        "rank exact match mean": np.mean(match_scores) if has_scores else np.nan,
    })

def nomer_any_exact_match_scores(row):
    """Summarise the any-rank exact-match scores of one row's responses.

    Args:
        row: Mapping-like row providing "taxon" and "responses" (an iterable of
            cleaned response strings).

    Returns:
        pd.Series with "any exact match best" and "any exact match mean".
        Both are NaN when the taxon is missing from `alignment.index` or the
        row has no responses to score.
    """
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[taxon]["classification"]
        # Named `match_scores` so the module-level `scores` frame is not shadowed.
        match_scores = [
            nomer_exact_match_at_any_rank(taxon, response, classification)
            for response in row["responses"]
        ]
    else:
        match_scores = []

    # Guard the mean as well as the maximum: np.mean on an empty list also gives
    # NaN, but only after emitting a RuntimeWarning.
    has_scores = len(match_scores) > 0
    return pd.Series({
        "any exact match best": np.max(match_scores) if has_scores else np.nan,
        "any exact match mean": np.mean(match_scores) if has_scores else np.nan,
    })

In [16]:
# Append the three groups of score columns, one row-wise scorer at a time.
# NOTE(review): this rebinds `scores` in place, so re-running the cell on an
# already-joined frame would presumably fail on the overlapping column names --
# re-run the cell that creates `scores` first.
scores = scores.join(scores.apply(axis=1, func=nomer_partial_match_scores))
scores = scores.join(scores.apply(axis=1, func=nomer_rank_exact_match_scores))
scores = scores.join(scores.apply(axis=1, func=nomer_any_exact_match_scores))

In [17]:
# Preview the first rows, now including the score columns.
scores.head(3)

Unnamed: 0,subject_rank,object_rank,taxon,kingdom,phylum,class,order,family,genus,Unnamed: 9,responses,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
0,phylum,kingdom,tracheophyta,plantae,,,,,,,"[plantae, plantae, plantae, plantae, plantae, ...",0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
1,class,kingdom,magnoliopsida,plantae,tracheophyta,,,,,,"[plantae, plantae, plantae, plantae, plantae, ...",0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,class,phylum,magnoliopsida,plantae,tracheophyta,,,,,,"[magnoliophyta, magnoliophyta, magnoliophyta, ...",0,0.969231,0.969231,1.2325950000000001e-32,0.0,0.0,0.0,0.0


In [18]:
# Keep only the rows that were actually scored: a comparison with NaN is False, so
# rows whose "partial match best" is NaN (no alignment entry or no usable response)
# are dropped.
only_scores = scores[scores["partial match best"] >= 0]

In [19]:
# Mean of every metric per subject rank, with the rows put in the order
# phylum -> genus and each value rendered with the ".2" format spec.
only_scores.groupby("subject_rank")[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean().loc[['phylum', 'class', 'order', 'family', 'genus']]\
              .map(lambda x: f"{x:.2}")

Unnamed: 0_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
subject_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
phylum,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
class,0.0,0.99,0.99,3.1e-33,0.75,0.75,0.75,0.75
order,0.0,0.99,0.99,2.1000000000000003e-33,0.83,0.83,0.83,0.83
family,0.0,1.0,1.0,1.5e-33,0.81,0.79,0.88,0.88
genus,0.0,1.0,1.0,6.2000000000000004e-34,0.85,0.85,0.95,0.95


In [20]:
# Mean of every metric per object rank, with the rows put in the order
# kingdom -> family and each value rendered with the ".2" format spec.
only_scores.groupby("object_rank")[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean().loc[['kingdom', 'phylum', 'class', 'order', 'family']]\
              .map(lambda x: f"{x:.2}")

Unnamed: 0_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
object_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
kingdom,0.0,1.0,1.0,0.0,1.0,0.98,1.0,1.0
phylum,0.0,0.99,0.99,5.4e-33,0.38,0.38,0.56,0.56
class,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
order,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
family,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [21]:
# Mean exact-match rates per (subject rank, object rank) pair, sorted from the
# highest to the lowest rank-specific rate and rendered as whole percentages.
only_scores.groupby(["subject_rank", "object_rank"])[['rank exact match mean',
       'any exact match mean']].mean().sort_values("rank exact match mean", ascending=False)\
              .map(lambda x: f"{x:.0%}")

Unnamed: 0_level_0,Unnamed: 1_level_0,rank exact match mean,any exact match mean
subject_rank,object_rank,Unnamed: 2_level_1,Unnamed: 3_level_1
class,kingdom,100%,100%
family,class,100%,100%
family,order,100%,100%
genus,class,100%,100%
genus,family,100%,100%
genus,kingdom,100%,100%
genus,order,100%,100%
order,class,100%,100%
order,kingdom,100%,100%
phylum,kingdom,100%,100%


In [22]:
# Per-taxon summary: mean of every metric for each (subject rank, taxon) pair.
# `summary` keeps the raw numbers; only the displayed copy is formatted.
summary = only_scores.groupby(["subject_rank", "taxon"])[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean()
# NOTE(review): the percent format is applied to every column, including the
# "garbage responses" count and the variance, where a percentage may mislead.
summary.map(lambda x: f"{x:.0%}")

Unnamed: 0_level_0,Unnamed: 1_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
subject_rank,taxon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
class,aves,0%,100%,100%,0%,100%,100%,100%,100%
class,magnoliopsida,0%,98%,98%,0%,50%,50%,50%,50%
class,mammalia,0%,100%,100%,0%,100%,100%,100%,100%
family,dasypodidae,0%,100%,100%,0%,100%,100%,100%,100%
family,fabaceae,0%,99%,99%,0%,75%,75%,75%,75%
family,picidae,0%,100%,100%,0%,75%,65%,100%,100%
family,sapindaceae,0%,99%,99%,0%,75%,75%,75%,75%
genus,acer,0%,99%,99%,0%,80%,80%,80%,80%
genus,amorpha,0%,100%,100%,0%,80%,80%,100%,100%
genus,dasypus,0%,100%,100%,0%,100%,100%,100%,100%


In [24]:
# Write the unformatted per-taxon summary as a tab-separated file.
summary.to_csv(env.output_files.summary, sep="\t")