In [25]:
import pandas as pd
import numpy as np
import textdistance as td
from typing import NamedTuple

In [57]:
class InputFiles(NamedTuple):
    responses: str
    alignments: str
    bad_names: str

class OutputFiles(NamedTuple):
    summary: str

class Env(NamedTuple):
    input_files: InputFiles
    output_files: OutputFiles

if "snakemake" in globals():
    env = Env(
        input_files=snakemake.input,
        output_files=snakemake.output
    )
else: # Fill in parameters manually for testing outside of snakemake
    import os
    ROOT = os.path.expanduser("~/biodiversity-llms")
    LLM = "gpt-3.5-turbo-0125" #"gpt-4-1106-preview"

    env = Env(
        input_files = InputFiles(
            responses=f"../../results/{LLM}/taxonomy/responses.tsv",
            alignments="../../results/input/idigbio-sample/alignments.tsv",
            bad_names="../../results/input/idigbio-sample/bad-names.tsv"
        ),
        output_files = OutputFiles(
            summary=f"../../results/{LLM}/taxonomy/summary.tsv"
        )
    )

def nest(level, strings):
    separator = "\n" + "  " * level + "- "
    return separator + separator.join([str(s) for s in strings])

def quote(strings):
    return [f'"{s}"' for s in strings]

print("Job parameters:")
print(env)

Job parameters:
Env(input_files=InputFiles(responses='../../results/gpt-3.5-turbo-0125/taxonomy/responses.tsv', alignments='../../results/input/idigbio-sample/alignments.tsv', bad_names='../../results/input/idigbio-sample/bad-names.tsv'), output_files=OutputFiles(taxonomy_performance='../../results/gpt-3.5-turbo-0125/taxonomy_performance.tsv'))


In [56]:
env.input_files

InputFiles(responses='../../results/gpt-3.5-turbo-0125/taxonomy/responses.tsv', alignment='../../results/input/alignment.tsv', bad_names='../../results/input/idigbio-sample/bad-names.tsv')

In [53]:
for k in env.input_files:
    print(k)

AttributeError: 'InputFiles' object has no attribute 'items'

In [2]:
GARBAGE_WORDS = {
    "the",
    "only",
    "that",
    "this",
    "sorry",
    "there",
    "theres",
    "apologies",
}

if env.input_files.bad_names is not None:
    bad_names = pd.read_csv(env.input_files.bad_names, sep="\t", dtype=str)\
        .set_index("providedName").index.str.lower().union([""])
else:
    bad_names = {}

def clean_response(response: str, object_rank: str):
    response = response.lower()
    s = str(response).split()[0] # Get the first word

    # For responses like "Phylum: Mollusca", use the second word instead
    if s[-1] == ":" or s == object_rank:
        s = str(response).split()[1]

    s = "".join([c for c in s if c.isalpha()]) # Filter out non-letters
    return s

def clean_response_list(responses: pd.Series, subject: str, object_rank: str):
    return list(filter(lambda x: not is_garbage_response(x, subject, object_rank), [clean_response(a, object_rank) for a in responses]))

def is_garbage_response(response, subject, object_rank):
    response = clean_response(response, object_rank)
    return response == subject or len(response) <= 2 or response in GARBAGE_WORDS

def mark_garbage_responses(responses, subject, object_rank):
    return map(lambda x: is_garbage_response(x, subject, object_rank), responses)

def clean_taxa(name):
    name = name.split()
    return "".join(filter(str.isalpha, name))
    
conv = {
    "taxon": clean_taxa,
    "responses": eval
}

responses = pd.read_csv(env.input_files.responses, sep="\t", converters=conv)
responses = responses[responses["kingdom"].isin(("animalia", "plantae"))]
responses = responses.drop(columns=["query", "question number", "input token count", "output token count"])
responses = responses[~responses["taxon"].isin(bad_names)]

responses["garbage responses"] = responses.apply(axis=1, func=lambda x: sum(mark_garbage_responses(x["responses"], x["taxon"], x["object_rank"])))
responses["responses"] = responses.apply(axis=1, func=lambda x: clean_response_list(x["responses"], x["taxon"], x["object_rank"]))

scores = responses.copy()
responses.head(3)

Unnamed: 0,subject_rank,object_rank,taxon,kingdom,phylum,class,order,family,genus,responses,garbage responses
0,phylum,kingdom,annelida,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",0
1,phylum,kingdom,arthropoda,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",3
2,phylum,kingdom,brachiopoda,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",0


### Score against Nomer alignment

In [3]:
# Nomer alignment
conv = {
    "aliases": eval,
    "classification": eval
}

alignment = pd.read_csv(env.input_files.alignments, sep="\t", index_col=0, converters=conv)
alignment.head(3)

Unnamed: 0_level_0,aliases,classification
name,Unnamed: 1_level_1,Unnamed: 2_level_1
abrothrix,{abrothrix},"{(null, boreoeutheria), (clade, deuterostomia)..."
acacia,"{hantavirus, acacia}","{(subkingdom, viridiplantae), (genus, orthohan..."
acaciella,{acaciella},"{(, root), (, embryophyta), (phylum, cyanobact..."


In [4]:
def score_partial_match(response, name):
    return td.jaro_winkler(response, name, prefix_weight=0.2)

def score_best_partial_match_any_rank(response, full_classification):
    best_score = 0
    best_match = None
    for rank, name in full_classification:
        score = score_partial_match(response, name)
        if score > best_score:
            best_score = score
            if score >= 1:
                return 1
    return best_score

row = responses.loc[300]

def nomer_partial_match_scores(row):
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[row["taxon"]]["classification"]
        scores = list(map(lambda r: score_best_partial_match_any_rank(r, classification), row["responses"]))
    else:
        scores = list()

    return pd.Series({
        "partial match best": np.max(scores) if len(scores) > 0 else np.nan,
        "partial match mean": np.mean(scores),
        "partial match var": np.var(scores),
    })

nomer_partial_match_scores(row)

partial match best    1.000000
partial match mean    0.980635
partial match var     0.003000
dtype: float64

In [5]:
def nomer_exact_match(subject, rank, response, classification):
    if subject in alignment.index:
        return 1.0 if any([x[1] == response for x in classification if x[0] == rank]) else 0.0
    else:
        return np.nan

def nomer_exact_match_at_any_rank(subject, response, classification):
    if subject in alignment.index:
        return 1.0 if any([x[1] == response for x in classification]) else 0.0
    else:
        return np.nan

def nomer_rank_exact_match_scores(row):
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[row["taxon"]]["classification"]
        scores = list(map(lambda r: nomer_exact_match(row["taxon"], row["object_rank"], r, classification), row["responses"]))
    else:
        scores = list()

    return pd.Series({
        "rank exact match best": np.max(scores) if len(scores) > 0 else np.nan,
        "rank exact match mean": np.mean(scores)
    })

def nomer_any_exact_match_scores(row):
    taxon = row["taxon"]
    if taxon in alignment.index:
        classification = alignment.loc[row["taxon"]]["classification"]
        scores = list(map(lambda r: nomer_exact_match_at_any_rank(row["taxon"], r, classification), row["responses"]))
    else:
        scores = list()

    return pd.Series({
        "any exact match best": np.max(scores) if len(scores) > 0 else np.nan,
        "any exact match mean": np.mean(scores)
    })

In [6]:
scores = scores.join(scores.apply(axis=1, func=nomer_partial_match_scores))
scores = scores.join(scores.apply(axis=1, func=nomer_rank_exact_match_scores))
scores = scores.join(scores.apply(axis=1, func=nomer_any_exact_match_scores))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [7]:
scores.head(3)

Unnamed: 0,subject_rank,object_rank,taxon,kingdom,phylum,class,order,family,genus,responses,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
0,phylum,kingdom,annelida,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
1,phylum,kingdom,arthropoda,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",3,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,phylum,kingdom,brachiopoda,animalia,,,,,,"[animalia, animalia, animalia, animalia, anima...",0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


In [8]:
only_scores = scores[scores["partial match best"] >= 0]

In [9]:
only_scores.groupby("subject_rank")[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean().loc[['phylum', 'class', 'order', 'family', 'genus']]\
              .map(lambda x: f"{x:.2}")

Unnamed: 0_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
subject_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
phylum,0.67,1.0,1.0,6.8e-06,1.0,0.99,1.0,0.99
class,1.4,0.99,0.98,0.00016,0.95,0.93,0.96,0.94
order,2.3,0.99,0.98,0.0017,0.89,0.83,0.93,0.89
family,2.8,0.98,0.98,0.00091,0.9,0.85,0.91,0.89
genus,3.0,0.97,0.97,0.00079,0.81,0.75,0.87,0.85


In [10]:
only_scores.groupby("object_rank")[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean().loc[['kingdom', 'phylum', 'class', 'order', 'family']]\
              .map(lambda x: f"{x:.2}")

Unnamed: 0_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
object_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
kingdom,1.3,0.98,0.97,0.0014,0.94,0.91,0.94,0.92
phylum,3.5,0.97,0.96,0.00076,0.68,0.59,0.84,0.81
class,3.5,0.97,0.97,0.00056,0.79,0.71,0.88,0.86
order,3.7,0.98,0.97,0.00052,0.81,0.76,0.86,0.84
family,3.5,0.98,0.97,0.00063,0.84,0.81,0.85,0.82


In [11]:
only_scores.groupby(["subject_rank", "object_rank"])[['rank exact match mean',
       'any exact match mean']].mean().sort_values("rank exact match mean", ascending=False)\
              .map(lambda x: f"{x:.0%}")

Unnamed: 0_level_0,Unnamed: 1_level_0,rank exact match mean,any exact match mean
subject_rank,object_rank,Unnamed: 2_level_1,Unnamed: 3_level_1
phylum,kingdom,99%,99%
family,kingdom,97%,100%
class,kingdom,95%,97%
order,phylum,93%,93%
genus,kingdom,91%,91%
class,phylum,90%,90%
family,phylum,84%,89%
family,class,82%,83%
genus,family,81%,82%
order,kingdom,77%,94%


In [46]:
summary = only_scores.groupby(["subject_rank", "taxon"])[['garbage responses',
       'partial match best', 'partial match mean', 'partial match var',
       'rank exact match best', 'rank exact match mean',
       'any exact match best', 'any exact match mean']].mean()
summary.map(lambda x: f"{x:.0%}")

Unnamed: 0_level_0,Unnamed: 1_level_0,garbage responses,partial match best,partial match mean,partial match var,rank exact match best,rank exact match mean,any exact match best,any exact match mean
subject_rank,taxon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
class,actinopterygii,300%,100%,100%,0%,100%,100%,100%,100%
class,amphibia,150%,100%,100%,0%,100%,100%,100%,100%
class,angiospermae,0%,100%,100%,0%,100%,100%,100%,100%
class,anthozoa,0%,100%,100%,0%,100%,100%,100%,100%
class,articulata,50%,90%,90%,0%,50%,50%,50%,50%
...,...,...,...,...,...,...,...,...,...
phylum,mollusca,0%,100%,100%,0%,100%,100%,100%,100%
phylum,porifera,0%,100%,100%,0%,100%,100%,100%,100%
phylum,rhodophyta,200%,100%,100%,0%,100%,88%,100%,88%
phylum,spermatophyta,0%,100%,100%,0%,100%,100%,100%,100%


In [47]:
summary.to_csv(env.output_files.summary, sep="\t")