In [35]:
from pathlib import Path

from tax_form.eval.data import load_predicted_jsonl, load_ground_truth_json
from tax_form.eval.metrics import aggregate_metrics_across_documents

# Evaluation input directories; both are relative paths, so they resolve
# against the kernel's current working directory.
true_path = Path("../data/target/")  # ground-truth side
pred_path = Path("../data/output/")  # prediction side

In [36]:
# Load every prediction (*.jsonl) and ground-truth (*.json) file, keyed by the
# file stem (presumably the document id shared by both sides -- confirm).
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# matches are sorted to keep the dict order identical from run to run.
pred = {
    path.stem: load_predicted_jsonl(path)
    for path in sorted(pred_path.glob("*.jsonl"))
}
gt = {
    path.stem: load_ground_truth_json(path)
    for path in sorted(true_path.glob("*.json"))
}

# Fail fast: the input paths are relative, so a wrong working directory makes
# the globs match nothing; catch that here instead of in the metrics step.
assert pred, f"no *.jsonl prediction files found in {pred_path.resolve()}"
assert gt, f"no *.json ground-truth files found in {true_path.resolve()}"

In [None]:
# Exact-match evaluation across all loaded documents, reported two ways.

# Micro average: overall metrics (TP/FP/FN summed over all documents).
micro_metrics = aggregate_metrics_across_documents(
    pred, gt, aggregation="micro", match_type="exact"
)

print("=" * 60, "MICRO-AVERAGED METRICS (Overall - Exact Match)", "=" * 60, sep="\n")
print(
    f"Precision: {micro_metrics['precision']:.4f}",
    f"Recall:    {micro_metrics['recall']:.4f}",
    f"F1 Score:  {micro_metrics['f1']:.4f}",
    sep="\n",
)
print(
    f"\nTrue Positives:  {micro_metrics['true_positives']}",
    f"False Positives: {micro_metrics['false_positives']}",
    f"False Negatives: {micro_metrics['false_negatives']}",
    sep="\n",
)

# Visual gap between the two reports.
print("\n")

# Macro average: mean of the per-document scores.
macro_metrics = aggregate_metrics_across_documents(
    pred, gt, aggregation="macro", match_type="exact"
)

print(
    "=" * 60,
    "MACRO-AVERAGED METRICS (Mean per Document - Exact Match)",
    "=" * 60,
    sep="\n",
)
print(
    f"Precision: {macro_metrics['precision']:.4f}",
    f"Recall:    {macro_metrics['recall']:.4f}",
    f"F1 Score:  {macro_metrics['f1']:.4f}",
    f"\nNumber of Documents: {macro_metrics['num_documents']}",
    sep="\n",
)

MICRO-AVERAGED METRICS (Overall - Exact Match)
Precision: 0.8475
Recall:    0.7937
F1 Score:  0.8197

True Positives:  50
False Positives: 9
False Negatives: 13


MACRO-AVERAGED METRICS (Mean per Document - Exact Match)
Precision: 0.7726
Recall:    0.7464
F1 Score:  0.7571

Number of Documents: 10


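The overlap-match metrics below are identical to the exact-match metrics above. Presumably no prediction only partially overlaps its ground-truth value, so every prediction either matches exactly or not at all. TODO: confirm that `match_type="overlap"` is actually honoured by `aggregate_metrics_across_documents`, since identical results could also mean the argument is being ignored.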

In [None]:
# Overlap-match evaluation across all loaded documents, reported two ways.
# NOTE: this rebinds micro_metrics / macro_metrics, replacing the values
# from any earlier evaluation cell.

# Micro average: overall metrics (TP/FP/FN summed over all documents).
micro_metrics = aggregate_metrics_across_documents(
    pred, gt, aggregation="micro", match_type="overlap"
)

print("=" * 60, "MICRO-AVERAGED METRICS (Overall - Overlap Match)", "=" * 60, sep="\n")
print(
    f"Precision: {micro_metrics['precision']:.4f}",
    f"Recall:    {micro_metrics['recall']:.4f}",
    f"F1 Score:  {micro_metrics['f1']:.4f}",
    sep="\n",
)
print(
    f"\nTrue Positives:  {micro_metrics['true_positives']}",
    f"False Positives: {micro_metrics['false_positives']}",
    f"False Negatives: {micro_metrics['false_negatives']}",
    sep="\n",
)

# Visual gap between the two reports.
print("\n")

# Macro average: mean of the per-document scores.
macro_metrics = aggregate_metrics_across_documents(
    pred, gt, aggregation="macro", match_type="overlap"
)

print(
    "=" * 60,
    "MACRO-AVERAGED METRICS (Mean per Document - Overlap Match)",
    "=" * 60,
    sep="\n",
)
print(
    f"Precision: {macro_metrics['precision']:.4f}",
    f"Recall:    {macro_metrics['recall']:.4f}",
    f"F1 Score:  {macro_metrics['f1']:.4f}",
    f"\nNumber of Documents: {macro_metrics['num_documents']}",
    sep="\n",
)

MICRO-AVERAGED METRICS (Overall - Overlap Match)
Precision: 0.8475
Recall:    0.7937
F1 Score:  0.8197

True Positives:  50
False Positives: 9
False Negatives: 13


MACRO-AVERAGED METRICS (Mean per Document - Overlap Match)
Precision: 0.7726
Recall:    0.7464
F1 Score:  0.7571

Number of Documents: 10
