# Annotating geometric hyponyms/hypernyms in the "Polygon" Wikipedia article 

In [99]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
import wikipedia

import pandas as pd
import numpy as np

from collections import defaultdict
from io import StringIO
import os
import hashlib

In [100]:
"""
Download the Polygon article (https://en.wikipedia.org/wiki/Polygon). Save it to a file.
If the file already exists and its contents differs from the freshly loaded wiki article, raise an error.
"""

POLYGON_ARTICLE_FILE_PATH = "polygon_wiki.txt"

# auto_suggest=False because https://stackoverflow.com/a/69886635
page = wikipedia.page(title="Polygon", auto_suggest=False, preload=True)

# NOTE(review): both open() calls below use the platform default text encoding;
# consider an explicit encoding if the notebook must run on several platforms.
if not os.path.isfile(POLYGON_ARTICLE_FILE_PATH):
    # First run: keep a local snapshot so the annotations stay tied to one article revision.
    with open(POLYGON_ARTICLE_FILE_PATH, mode="w") as file:
        file.write(page.content)
else:
    # A snapshot already exists: refuse to continue if Wikipedia now serves different text.
    with open(POLYGON_ARTICLE_FILE_PATH) as file:
        saved_text = file.read()
    if saved_text != page.content:
        raise RuntimeError(f"The article has changed, check the file {POLYGON_ARTICLE_FILE_PATH}")

In [101]:
def chunk_wikipedia_article(
    article_text: str,
    chunk_size_limit: int = 2000,
    chunk_overlap: int = 200,
) -> list:
    """Split a Wikipedia article into chunks, dropping auxiliary sections.

    The text is first split on "==" headers with ``MarkdownHeaderTextSplitter``;
    the header of each section is stored under the ``"header"`` metadata key.
    Sections whose header starts with "See also", "Notes", "References" or
    "External links" are discarded. The remaining sections are then split
    further with ``RecursiveCharacterTextSplitter``.

    Args:
        article_text: Plain text of the article.
        chunk_size_limit: Passed to the recursive splitter as ``chunk_size``.
        chunk_overlap: Passed to the recursive splitter as ``chunk_overlap``.

    Returns:
        The list returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    aux_title_prefixes = ("See also", "Notes", "References", "External links")

    def is_aux_section(d) -> bool:
        # Sections without a "header" entry (e.g. the text before the first header) are kept.
        if "header" not in d.metadata:
            return False
        return d.metadata["header"].startswith(aux_title_prefixes)

    section_splitter = MarkdownHeaderTextSplitter([("==", "header")], strip_headers=False)
    sections = [
        section
        for section in section_splitter.split_text(article_text)
        if not is_aux_section(section)
    ]

    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_limit,
        chunk_overlap=chunk_overlap,
    )
    return size_splitter.split_documents(sections)

def get_chunk_hash(chunk: str) -> str:
    """Return the hexadecimal MD5 digest of ``chunk`` (encoded with ``str.encode()`` defaults)."""
    digest = hashlib.md5()
    digest.update(chunk.encode())
    return digest.hexdigest()

In [102]:
"""
Load the Polygon Wikipedia article from file
and split it into chunks. 
"""

# Read the locally saved snapshot rather than the live page object.
with open(POLYGON_ARTICLE_FILE_PATH) as file:
    article_text = file.read()

docs = chunk_wikipedia_article(article_text)
# Maps a chunk index (position in `docs`) to its list of (hyponym, hypernym) pairs;
# filled in by the annotation cells below.
annotations = {}

## Go through the chunks (aka `docs`) and annotate each one.

In [103]:
import textwrap

def pretty_print(document) -> None:
    """Print ``document.page_content`` re-wrapped to textwrap's default line width."""
    wrapped_lines = textwrap.wrap(document.page_content)
    print("\n".join(wrapped_lines))

In [104]:
i = 0
pretty_print(docs[i])
# Ground-truth (hyponym, hypernym) pairs for the introductory chunk.
# Every pair is listed exactly once: this list is later written to
# annotations.json as-is, so a repeated entry would end up duplicated
# in the ground-truth file.
annotations[i] = [
    ("polygon", "plane figure"),
    ("closed polygonal chain", "polygon"),
    ("vertex", "point"),
    ("edge", "segment"),
    ("side", "segment"),
    ("n-gon", "polygon"),
    ("triangle", "n-gon"),
    ("3-gon", "n-gon"),
    ("simple polygon", "boundary"),
    ("simple polygon", "polygon"),
    ("solid polygon", "region"),
    ("solid polygon", "polygon"),
    ("skew polygon", "polygon"),
    ("self-intersecting polygon", "polygon"),
    ("star polygon", "polygon"),
    ("polygon", "polytope"),
]

In geometry, a polygon () is a plane figure made up of line segments
connected to form a closed polygonal chain. The segments of a closed
polygonal chain are called its edges or sides. The points where two
edges meet are the polygon's vertices or corners. An n-gon is a
polygon with n sides; for example, a triangle is a 3-gon. A simple
polygon is one which does not intersect itself. More precisely, the
only allowed intersections among the line segments that make up the
polygon are the shared endpoints of consecutive segments in the
polygonal chain. A simple polygon is the boundary of a region of the
plane that is called a solid polygon. The interior of a solid polygon
is its body, also known as a polygonal region or polygonal area. In
contexts where one is concerned only with simple and solid polygons, a
polygon may refer only to a simple polygon or to a solid polygon. A
polygonal chain may cross over itself, creating star polygons and
other self-intersecting polygons. Some sources also

In [105]:
# Chunk 1: the "Etymology" section.
i = 1
pretty_print(docs[i])
annotations[i] = [
]
# No meaningful annotations: this short section on etymology contains no hyponym/hypernym pairs.

== Etymology == The word polygon derives from the Greek adjective
πολύς (polús) 'much', 'many' and γωνία (gōnía) 'corner' or 'angle'. It
has been suggested that γόνυ (gónu) 'knee' may be the origin of gon.


In [106]:
i = 2
pretty_print(docs[i])
# Different kinds of polygons (start of the "Classification" section).
# Terms are spelled exactly as an answer is expected to spell them, because
# the metric defined later in this notebook counts exact matches only.
annotations[i] = [
    ("convex polygon", "polygon"),
    ("non-convex polygon", "polygon"),
    ("simple polygon", "polygon"),
    ("concave polygon", "polygon"),
    # ("Euclidean geometry", "geometry"),
    ("star-shaped polygon", "polygon"),
    ("self-intersecting polygon", "polygon"),
    ("star polygon", "polygon"),
    ("equiangular polygon", "polygon"),
    ("equilateral polygon", "polygon"),
    ("regular polygon", "polygon"),
    ("cyclic polygon", "polygon"),
    ("tangential polygon", "polygon"),
    ("isogonal polygon", "polygon"),
    ("vertex-transitive polygon", "polygon"),
]

== Classification ==   === Number of sides === Polygons are primarily
classified by the number of sides.   === Convexity and intersection
=== Polygons may be characterized by their convexity or type of non-
convexity:   Convex: any line drawn through the polygon (and not
tangent to an edge or corner) meets its boundary exactly twice. As a
consequence, all its interior angles are less than 180°. Equivalently,
any line segment with endpoints on the boundary passes through only
interior points between its endpoints. This condition is true for
polygons in any geometry, not just Euclidean. Non-convex: a line may
be found which meets its boundary more than twice. Equivalently, there
exists a line segment between two boundary points that passes outside
the polygon. Simple: the boundary of the polygon does not cross
itself. All convex polygons are simple. Concave: Non-convex and
simple. There is at least one interior angle greater than 180°. Star-
shaped: the whole interior is visible from at 

In [107]:
# Chunk 3: the rest of the classification list plus the "Miscellaneous" subsection.
i = 3
pretty_print(docs[i])
annotations[i] = [
    ("tangential polygon", "polygon"),
    ("isogonal polygon", "polygon"),
    ("isotoxal polygon", "polygon"),
    ("rectilinear polygon", "polygon"),
    # Open question: "Monotone with respect to a given line L" --- too long to be included as a term
]

Tangential: all sides are tangent to an inscribed circle. Isogonal or
vertex-transitive: all corners lie within the same symmetry orbit. The
polygon is also cyclic and equiangular. Isotoxal or edge-transitive:
all sides lie within the same symmetry orbit. The polygon is also
equilateral and tangential.The property of regularity may be defined
in other ways: a polygon is regular if and only if it is both isogonal
and isotoxal, or equivalently it is both cyclic and equilateral. A
non-convex regular polygon is called a regular star polygon.   ===
Miscellaneous === Rectilinear: the polygon's sides meet at right
angles, i.e. all its interior angles are 90 or 270 degrees. Monotone
with respect to a given line L: every line orthogonal to L intersects
the polygon not more than twice.


In [108]:
i = 4
pretty_print(docs[i])
# Start of the "Properties and formulas" section (the "Angles" subsection).
# Every pair is listed exactly once: this list is later written to
# annotations.json as-is, so a repeated entry would end up duplicated
# in the ground-truth file.
annotations[i] = [
    # ("Euclidean geometry", "geometry"),
    ("interior angle", "angle"),
    ("exterior angle", "angle"),
    ("external polygon", "polygon"),
    ("simple n-gon", "n-gon"),
    ("convex n-gon", "n-gon"),
    ("concave simple polygon", "polygon"),
    # pentagram, angular "eight" or antiparallelogram, turning number --- all without obvious hypernyms
]

== Properties and formulas == Euclidean geometry is assumed
throughout.   === Angles === Any polygon has as many corners as it has
sides. Each corner has several angles. The two most important ones
are:   Interior angle – The sum of the interior angles of a simple
n-gon is (n − 2) × π radians or (n − 2) × 180 degrees. This is because
any simple n-gon ( having n sides ) can be considered to be made up of
(n − 2) triangles, each of which has an angle sum of π radians or 180
degrees. The measure of any interior angle of a convex regular n-gon
is   (   1 −   2 n   )   π   {\displaystyle \left(1-{\tfrac
{2}{n}}\right)\pi } radians or   180 −   360 n   {\displaystyle
180-{\tfrac {360}{n}}} degrees. The interior angles of regular star
polygons were first studied by Poinsot, in the same paper in which he
describes the four regular star polyhedra: for a regular   p q
{\displaystyle {\tfrac {p}{q}}} -gon (a p-gon with central density q),
each interior angle is   π ( p − 2 q )   p   {\displaystyl

In [109]:
# Chunk 5: the "Area" subsection (simple polygons).
i = 5
pretty_print(docs[i])
annotations[i] = [
    ("simple polygon", "polygon"),
    ("non-self-intersecting polygon", "polygon"),
    # Candidate pair that was left out of the ground truth:
    # ("signed area", "area")
    ("exterior angle", "angle"),
]

=== Area === In this section, the vertices of the polygon under
consideration are taken to be   (   x   0   ,   y   0   ) , (   x   1
,   y   1   ) , … , (   x   n − 1   ,   y   n − 1   )   {\displaystyle
(x_{0},y_{0}),(x_{1},y_{1}),\ldots ,(x_{n-1},y_{n-1})} in order. For
convenience in some formulas, the notation (xn, yn) = (x0, y0) will
also be used.   ==== Simple polygons ====   If the polygon is non-
self-intersecting (that is, simple), the signed area is   A =   1 2
∑   i = 0   n − 1   (   x   i   y   i + 1   −   x   i + 1   y   i   )
where   x   n   =   x   0   and   y   n   =   y   0   ,
{\displaystyle A={\frac {1}{2}}\sum
_{i=0}^{n-1}(x_{i}y_{i+1}-x_{i+1}y_{i})\quad {\text{where
}}x_{n}=x_{0}{\text{ and }}y_{n}=y_{0},} or, using determinants   16
A   2   =   ∑   i = 0   n − 1   ∑   j = 0   n − 1   |   Q   i , j   Q
i , j + 1   Q   i + 1 , j   Q   i + 1 , j + 1   |   ,   {\displaystyle
16A^{2}=\sum _{i=0}^{n-1}\sum _{j=0}^{n-
1}{\begin{vmatrix}Q_{i,j}&Q_{i,j+1}\\Q_{i+1,j}&Q_{i+

In [110]:
i = 6
pretty_print(docs[i])
# Caveat: "polygon" and "n-gon" mean the same thing and are used interchangeably,
# so the same adjective is annotated against both terms below.
annotations[i] = [
    ("interior grid points", "grid points"),
    ("boundary grid points", "grid points"),
    ("simple polygon", "polygon"),
    ("cyclic n-gon", "n-gon"),
    ("regular n-gon", "n-gon"),
    ("regular polygon", "polygon"),
    ("cyclic polygon", "polygon"),
    ("inscribed circle", "circle"),
]

A =   1 2   (   a   1   [   a   2   sin ⁡ (   θ   1   ) +   a   3
sin ⁡ (   θ   1   +   θ   2   ) + ⋯ +   a   n − 1   sin ⁡ (   θ   1
+   θ   2   + ⋯ +   θ   n − 2   ) ]   +   a   2   [   a   3   sin ⁡ (
θ   2   ) +   a   4   sin ⁡ (   θ   2   +   θ   3   ) + ⋯ +   a   n −
1   sin ⁡ (   θ   2   + ⋯ +   θ   n − 2   ) ]   + ⋯ +   a   n − 2   [
a   n − 1   sin ⁡ (   θ   n − 2   ) ] ) .   {\displaystyle
{\begin{aligned}A={\frac {1}{2}}(a_{1}[a_{2}\sin(\theta
_{1})+a_{3}\sin(\theta _{1}+\theta _{2})+\cdots +a_{n-1}\sin(\theta
_{1}+\theta _{2}+\cdots +\theta _{n-2})]\\{}+a_{2}[a_{3}\sin(\theta
_{2})+a_{4}\sin(\theta _{2}+\theta _{3})+\cdots +a_{n-1}\sin(\theta
_{2}+\cdots +\theta _{n-2})]\\{}+\cdots +a_{n-2}[a_{n-1}\sin(\theta
_{n-2})]).\end{aligned}}} The formula was described by Lopshits in
1963.If the polygon can be drawn on an equally spaced grid such that
all its vertices are grid points, Pick's theorem gives a simple
formula for the polygon's area based on the numbers of interior and
b

In [111]:
# Chunk 7: area of regular polygons.
# Commented-out entries are candidate pairs or terms noted while reading the chunk
# that were not added to the ground truth.
i = 7
pretty_print(docs[i])
annotations[i] = [
    ("regular polygon", "polygon"),
    # ("inscribed circle", "circle"),
    ("regular n-gon", "n-gon"),
    # ("unit-radius", "circle"),
    ("self-intersecting polygon", "polygon"),
    # central convex pentagon
    # center of a pentagram, density
    # triangular regions,
    # cross-quadrilateral, figure 8
    # have opposite-signed densities
    # whole figure
]

The area of a regular polygon is given in terms of the radius r of its
inscribed circle and its perimeter p by   A =   1 2   ⋅ p ⋅ r .
{\displaystyle A={\tfrac {1}{2}}\cdot p\cdot r.} This radius is also
termed its apothem and is often represented as a. The area of a
regular n-gon in terms of the radius R of its circumscribed circle can
be expressed trigonometrically as:   A =   R   2   ⋅   n 2   ⋅ sin ⁡
2 π   n   =   R   2   ⋅ n ⋅ sin ⁡   π n   ⋅ cos ⁡   π n
{\displaystyle A=R^{2}\cdot {\frac {n}{2}}\cdot \sin {\frac {2\pi
}{n}}=R^{2}\cdot n\cdot \sin {\frac {\pi }{n}}\cdot \cos {\frac {\pi
}{n}}} The area of a regular n-gon inscribed in a unit-radius circle,
with side s and interior angle   α ,   {\displaystyle \alpha ,} can
also be expressed trigonometrically as:   A =   n   s   2   4   cot ⁡
π n   =   n   s   2   4   cot ⁡   α   n − 2   = n ⋅ sin ⁡   α   n − 2
⋅ cos ⁡   α   n − 2   .   {\displaystyle A={\frac {ns^{2}}{4}}\cot
{\frac {\pi }{n}}={\frac {ns^{2}}{4}}\cot {\frac {\alpha

In [112]:
# Chunk 8: end of the area discussion and the "Centroid" subsection.
# No pairs are annotated here; the comments only list terms noted while reading the chunk.
i = 8
pretty_print(docs[i])
annotations[i] = [
    # enclosed region, point set
    # simple polygon
    # self-intersecting polygon
    # cross-quadrilateral
    # simple triangles
]

Considering the enclosed regions as point sets, we can find the area
of the enclosed point set. This corresponds to the area of the plane
covered by the polygon or to the area of one or more simple polygons
having the same outline as the self-intersecting one. In the case of
the cross-quadrilateral, it is treated as two simple triangles.   ===
Centroid === Using the same convention for vertex coordinates as in
the previous section, the coordinates of the centroid of a solid
simple polygon are   C   x   =   1   6 A   ∑   i = 0   n − 1   (   x
i   +   x   i + 1   ) (   x   i   y   i + 1   −   x   i + 1   y   i
) ,   {\displaystyle C_{x}={\frac {1}{6A}}\sum
_{i=0}^{n-1}(x_{i}+x_{i+1})(x_{i}y_{i+1}-x_{i+1}y_{i}),}   C   y   =
1   6 A   ∑   i = 0   n − 1   (   y   i   +   y   i + 1   ) (   x   i
y   i + 1   −   x   i + 1   y   i   ) .   {\displaystyle C_{y}={\frac
{1}{6A}}\sum _{i=0}^{n-1}(y_{i}+y_{i+1})(x_{i}y_{i+1}-x_{i+1}y_{i}).}
In these formulas, the signed value of area   A   {\displa

In [113]:
# Chunk 9: the "Generalizations" section.
i = 9
pretty_print(docs[i])
annotations[i] = [
    ("spherical polygon", "polygon"),
    ("digon", "polygon"),
    # flat plane, uniform polyhedra, Wythoff's construction
    ("skew polygon", "polygon"),
    ("Petrie polygon", "polygon"),
    # Candidate pair that was left out of the ground truth:
    # ("apeirogon", "polygon")
    ("skew apeirogon", "apeirogon"),
    ("polygon with holes", "polygon"),
    ("area-connected polygon", "polygon"),
    ("multiply-connected polygon", "polygon"),
    ("planar polygon", "polygon"),
    ("complex polygon", "polygon"),
    ("abstract polygon", "polygon"),
    ("polyhedron", "three-dimensional solid"),
    # polytope, polyhedron
]

== Generalizations == The idea of a polygon has been generalized in
various ways. Some of the more important include:   A spherical
polygon is a circuit of arcs of great circles (sides) and vertices on
the surface of a sphere. It allows the digon, a polygon having only
two sides and two corners, which is impossible in a flat plane.
Spherical polygons play an important role in cartography (map making)
and in Wythoff's construction of the uniform polyhedra. A skew polygon
does not lie in a flat plane, but zigzags in three (or more)
dimensions. The Petrie polygons of the regular polytopes are well
known examples. An apeirogon is an infinite sequence of sides and
angles, which is not closed but has no ends because it extends
indefinitely in both directions. A skew apeirogon is an infinite
sequence of sides and angles that do not lie in a flat plane. A
polygon with holes is an area-connected or multiply-connected planar
polygon with one external boundary and one or more interior boundaries


In [114]:
# Chunk 10: the "Naming" section; named polygons are annotated as kinds of polygon.
i = 10
pretty_print(docs[i])
annotations[i] = [
    ("pentagon", "polygon"),
    ("dodecagon", "polygon"),
    ("triangle", "polygon"),
    ("quadrilateral", "polygon"),
    ("nonagon", "polygon"),
    ("decagon", "polygon"),
    ("pentagram", "polygon"),
    # quasiregular polyhedra
]

== Naming == The word polygon comes from Late Latin polygōnum (a
noun), from Greek πολύγωνον (polygōnon/polugōnon), noun use of neuter
of πολύγωνος (polygōnos/polugōnos, the masculine adjective), meaning
"many-angled". Individual polygons are named (and sometimes
classified) according to the number of sides, combining a Greek-
derived numerical prefix with the suffix -gon, e.g. pentagon,
dodecagon. The triangle, quadrilateral and nonagon are exceptions.
Beyond decagons (10-sided) and dodecagons (12-sided), mathematicians
generally use numerical notation, for example 17-gon and
257-gon.Exceptions exist for side counts that are easily expressed in
verbal form (e.g. 20 and 30), or are used by non-mathematicians. Some
special polygons also have their own names; for example the regular
star pentagon is also known as the pentagram.   To construct the name
of a polygon with more than 20 and fewer than 100 edges, combine the
prefixes as follows. The "kai" term applies to 13-gons and higher and

In [115]:
# Chunk 11: the "History" section.
i = 11
pretty_print(docs[i])
annotations[i] = [
    ("regular polygon", "polygon"),
    ("pentagram", "non-convex regular polygon"),
    ("non-convex regular polygon", "polygon"),
    ("star polygon", "polygon"),
    ("non-convex polygon", "polygon"),
    ("real dimension", "dimension"),
    ("imaginary dimension", "dimension"),
    ("complex polygon", "polygon"),
]

== History == Polygons have been known since ancient times. The
regular polygons were known to the ancient Greeks, with the pentagram,
a non-convex regular polygon (star polygon), appearing as early as the
7th century B.C. on a krater by Aristophanes, found at Caere and now
in the Capitoline Museum.The first known systematic study of non-
convex polygons in general was made by Thomas Bradwardine in the 14th
century.In 1952, Geoffrey Colin Shephard generalized the idea of
polygons to the complex plane, where each real dimension is
accompanied by an imaginary one, to create complex polygons.


In [116]:
# Chunk 12: the "In nature" section.
i = 12
pretty_print(docs[i])
annotations[i] = [
    ("regular hexagon", "hexagon"),
    # array of hexagons
    # surface
    # sides and base of each cell are polygons
]

== In nature == Polygons appear in rock formations, most commonly as
the flat facets of crystals, where the angles between the sides depend
on the type of mineral from which the crystal is made. Regular
hexagons can occur when the cooling of lava forms areas of tightly
packed columns of basalt, which may be seen at the Giant's Causeway in
Northern Ireland, or at the Devil's Postpile in California. In
biology, the surface of the wax honeycomb made by bees is an array of
hexagons, and the sides and base of each cell are also polygons.


In [117]:
i = 13
pretty_print(docs[i])
# The "Computer graphics" section.
# Terms are spelled as in the article ("tessellation", double "l"), because the
# metric defined later in this notebook counts exact matches only.
annotations[i] = [
    ("polygon", "primitive"),
    ("polygon mesh", "tessellation"),
    # polygons are two-dimensional
    # simple polygon
    # line segments
    # arrays of vertices (the coordinates of the geometrical vertices,
    # as well as other attributes of the polygon, such as color, shading and texture)
]

== Computer graphics ==   In computer graphics, a polygon is a
primitive used in modelling and rendering. They are defined in a
database, containing arrays of vertices (the coordinates of the
geometrical vertices, as well as other attributes of the polygon, such
as color, shading and texture), connectivity information, and
materials.Any surface is modelled as a tessellation called polygon
mesh. If a square mesh has n + 1 points (vertices) per side, there are
n squared squares in the mesh, or 2n squared triangles since there are
two triangles in a square. There are (n + 1)2 / 2(n2) vertices per
triangle. Where n is large, this approaches one half. Or, each vertex
inside the square mesh connects four edges (lines). The imaging system
calls up the structure of polygons needed for the scene to be created
from the database. This is transferred to active memory and finally,
to the display system (screen, TV monitors etc.) so that the scene can
be viewed. During this process, the imaging syst

## Observations

- there are chunks from which no meaningful pairs can be extracted
- most annotated pairs are of the form (\<adjective\> X, X), e.g. `("simple polygon", "polygon")`
- if a phrase containing "\<adjective\> X" appears and X is a valid term without the adjective, then I require (\<adjective\> X, X) to be retrieved by the LLM, even though X might be used within the chunk with this adjective only
- there are synonymous terms like n-gon and polygon that make annotations dubious for naive metric calculation. I don't count those as hyponym/hypernym pairs.

In [118]:
# Merge the per-chunk lists into one set of unique (hyponym, hypernym) pairs.
# Despite its name, the variable holds a set, so repeated pairs collapse.
all_annots_in_one_list = set().union(*annotations.values())

all_annots_in_one_list

{('3-gon', 'n-gon'),
 ('Petrie polygon', 'polygon'),
 ('abstract polygon', 'polygon'),
 ('area-connected polygon', 'polygon'),
 ('boundary grid points', 'grid points'),
 ('closed polygonal chain', 'polygon'),
 ('complex polygon', 'polygon'),
 ('concave polygon', 'polygon'),
 ('concave simple polygon', 'polygon'),
 ('convex n-gon', 'n-gon'),
 ('convex polygon', 'polygon'),
 ('cyclic n-gon', 'n-gon'),
 ('cyclic polygon', 'polygon'),
 ('decagon', 'polygon'),
 ('digon', 'polygon'),
 ('dodecagon', 'polygon'),
 ('edge', 'segment'),
 ('equiangular polygon', 'polygon'),
 ('equilateral polygon', 'polygon'),
 ('exterior angle', 'angle'),
 ('external polygon', 'polygon'),
 ('imaginary dimension', 'dimension'),
 ('inscribed circle', 'circle'),
 ('interior angle', 'angle'),
 ('interior grid points', 'grid points'),
 ('isogonal polygon', 'polygon'),
 ('isotoxal polygon', 'polygon'),
 ('multiply-connected polygon', 'polygon'),
 ('n-gon', 'polygon'),
 ('non-convex polygon', 'polygon'),
 ('non-convex r

In [126]:
"""
Save the annotations to disk for future reuse
"""
import json

annotations_sorted = [annot for idx, annot in sorted(annotations.items())]
chunks = [d.page_content for d in docs]

# Look each chunk up by the index its annotations were stored under, instead of
# zipping two lists by position. With zip, a skipped or missing index would
# silently attach every later annotation list to the wrong chunk; with the
# explicit lookup an index that has no chunk raises IndexError.
chunks_with_pairs = [
    {
        "chunk": chunks[idx],
        "x_is_a_y": annot,
    }
    for idx, annot in sorted(annotations.items())
]
to_save = {"annotated_chunks": chunks_with_pairs}

# Tuples are serialized as JSON arrays, so pairs come back as 2-element lists.
with open("annotations.json", "w") as file:
    json.dump(to_save, file)

In [131]:
# Sanity check: read the saved file back and preview the first 300 characters of its repr.
with open("annotations.json") as file:
    js = json.load(file)

print(str(js)[:300])

{'annotated_chunks': [{'chunk': "In geometry, a polygon () is a plane figure made up of line segments connected to form a closed polygonal chain.\nThe segments of a closed polygonal chain are called its edges or sides. The points where two edges meet are the polygon's vertices or corners. An n-gon i


## Example: how to calculate the metrics

Suppose the LLM read a chunk of Wikipedia text and returned a set of pairs for this chunk that is similar to my annotations above.
How do we compare the LLM's answer to the fuzzy ground truth?
Denote the LLM answers by $\{(l_i, L_i)\}_{i=1}^n$ (meaning $l_i$ is a kind of $L_i$) and the ground truth by $\{(t_j, T_j)\}_{j=1}^m$.

Here's an outline of the algorithm plus some ideas for the future:

1. validate both sets of pairs for duplicates and other trivially detectable inconsistencies

2. convert everything to lowercase and strip leading/trailing whitespace

3. find exact matches, count them and exclude them from the sets.

4. Calculate the "recall" metric, i.e. the fraction of the ground truth samples that are covered by the LLM answer.

5. Calculate the "trash rate" metric, i.e. $\frac{n-C}{n}$ where we have $n$ pairs from the LLM and $C$ of them match some pairs from the ground truth.

6. How can we aggregate the results when we've calculated the metrics for each of the N chunks? We can simply concatenate the lists of pairs for them and calculate recall and trash rate for the whole article.
We can also keep the metrics for each chunk and average them, perhaps weighted by the number of ground truth pairs in each chunk.
It's useful to check which chunks were least successful, what the typical problems were.

7. MAYBE LATER: find partial matches by pairing $(l_i, L_i)$ with $(t_j, T_j)$ if $d(l_i, t_j) < D$ and $d(L_i, T_j) < D$ for some distance $d$ and threshold $D$ (e.g. edit distance of 2). This might account for slight variations such as apostrophes or spelling mistakes.

8. MAYBE LATER: go over unmatched LLM answers and check for skipped elements in the hypernym chain, i.e. (rhombus, polygon) is a partially correct answer if the ground truth contains both (rhombus, quadrilateral) and (quadrilateral, polygon) 

9. MAYBE LATER: For each wrong answer (not a match), find a closest unpaired ground truth answer. That might ease debugging.

10. MAYBE LATER: use LLM as a judge.

11. MAYBE LATER: support annotations with a list of equivalent acceptable pairs (e.g. acceptable spelling variations)

12. MAYBE LATER: support annotations with possible but undesirable answers to catch wrong answers

13. MAYBE LATER: add confidence scores for each ground truth pair, i.e. how important is it for the LLM to retrieve this pair. 

In [132]:
def normalize_pairs(pairs) -> set:
    """Lowercase and strip both terms of every pair.

    Returns a set, so pairs that become identical after normalization collapse
    into one entry. Applying the function twice gives the same result.
    """
    return {(hypo.lower().strip(), hyper.lower().strip()) for hypo, hyper in pairs}


def calculate_pair_metrics(ground_truth, llm_answers) -> dict:
    """Compare LLM (hyponym, hypernym) pairs with the ground truth by exact match.

    Args:
        ground_truth: Iterable of annotated (hyponym, hypernym) string pairs.
        llm_answers: Iterable of (hyponym, hypernym) string pairs returned by the LLM.

    Returns:
        A dict with keys:
        "exact_matches": pairs present in both inputs (after normalization);
        "wrong_answers": LLM pairs that match no ground-truth pair;
        "unpaired_ground_truth": ground-truth pairs the LLM did not return;
        "recall": matched ground-truth pairs / all ground-truth pairs
            (1 when there is no ground truth);
        "trash_rate": wrong LLM pairs / all LLM pairs (0 when the LLM returned nothing).
    """
    truth = normalize_pairs(ground_truth)
    answers = normalize_pairs(llm_answers)

    exact_matches = truth & answers
    wrong_answers = answers - exact_matches
    unpaired_ground_truth = truth - exact_matches

    # Recall is the fraction of the ground truth that WAS found, so the numerator
    # is the number of matches, not the number of unmatched ground-truth pairs.
    recall = len(exact_matches) / len(truth) if truth else 1
    trash_rate = len(wrong_answers) / len(answers) if answers else 0

    return {
        "exact_matches": exact_matches,
        "wrong_answers": wrong_answers,
        "unpaired_ground_truth": unpaired_ground_truth,
        "recall": recall,
        "trash_rate": trash_rate,
    }


# Some sample data to calculate the metrics on.
ground_truth = [
    ("interior angle", "angle"),
    ("exterior angle", "angle"),
    ("regular polygon", "polygon"),
    ("cyclic polygon", "polygon"),
]
llm_answers = [
    ("interior angle", "angle"),
    ("cyclic polygon", "polygon"),
    ("bogus example", "polygon"),
    ("regular_polygon", "angle"),
]

# Convert to lowercase, strip.
# Validation step (skipped); duplicates are removed when the sets are constructed.
ground_truth = normalize_pairs(ground_truth)
llm_answers = normalize_pairs(llm_answers)

metrics = calculate_pair_metrics(ground_truth, llm_answers)

# Keep the individual results available as notebook-level names.
exact_matches = metrics["exact_matches"]
wrong_answers = metrics["wrong_answers"]
unpaired_ground_truth = metrics["unpaired_ground_truth"]
recall = metrics["recall"]
trash_rate = metrics["trash_rate"]

print(f"{recall = }, {trash_rate = }")
print(f"{exact_matches = }")
print(f"{wrong_answers = }")
print(f"{unpaired_ground_truth = }")

recall = 0.5, trash_rate = 0.5
exact_matches = {('cyclic polygon', 'polygon'), ('interior angle', 'angle')}
wrong_answers = {('bogus example', 'polygon'), ('regular_polygon', 'angle')}
unpaired_ground_truth = {('exterior angle', 'angle'), ('regular polygon', 'polygon')}
