In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from EvaluatorUtils import evaluate
import json

In [2]:
def plot_overview(results):
    """Render two interactive Plotly charts summarising evaluation scores.

    1. A grouped bar chart with the mean precision, recall and F1 per field
       (averaged over all documents).
    2. A heatmap of the F1 score for every (document, field) pair.

    Parameters
    ----------
    results : dict
        Nested mapping ``{document: {field: scores}}``. Each ``scores`` dict
        must provide at least the keys ``"precision"``, ``"recall"`` and
        ``"f1"``; any further keys become additional DataFrame columns and are
        ignored by the plots.

    Returns
    -------
    None
        The figures are displayed via ``Figure.show()``. When ``results``
        contains no (document, field) entries nothing is plotted and a short
        notice is printed instead.
    """
    metrics = ["precision", "recall", "f1"]

    # Flatten the nested mapping into one record per (document, field) pair.
    rows = [
        {"Document": doc, "Field": field, **scores}
        for doc, fields in results.items()
        for field, scores in fields.items()
    ]

    # An empty record list yields a column-less DataFrame, on which
    # groupby("Field") would raise KeyError. Bail out early instead.
    if not rows:
        print("No evaluation results to plot.")
        return

    df = pd.DataFrame(rows)

    # Barplot: average metrics per field
    avg_df = df.groupby("Field")[metrics].mean().reset_index()
    avg_df = avg_df.melt(
        id_vars="Field",
        value_vars=metrics,
        var_name="Metric",
        value_name="Score",
    )

    fig_bar = px.bar(
        avg_df,
        x="Field",
        y="Score",
        color="Metric",
        barmode="group",
        title="Average Precision, Recall, F1 per Field",
        text_auto=".2f",
        range_y=[0, 1]
    )
    fig_bar.update_layout(yaxis_title="Score", legend_title="Metric")
    fig_bar.show()

    # Heatmap: F1 per document and field
    pivot = df.pivot(index="Document", columns="Field", values="f1")
    fig_heatmap = go.Figure(
        data=go.Heatmap(
            z=pivot.values,
            x=pivot.columns,
            y=pivot.index,
            colorscale="YlGnBu",
            colorbar=dict(title="F1 Score"),
            zmin=0, zmax=1,
            # Pass a plain 2-D array of strings, aligned cell-for-cell with z.
            text=pivot.round(2).astype(str).to_numpy(),
            hovertemplate="Document: %{y}<br>Field: %{x}<br>F1: %{z:.2f}<extra></extra>"
        )
    )
    fig_heatmap.update_layout(
        title="F1 Score per Document and Field",
        xaxis_title="Field",
        yaxis_title="Document",
        # Grow the figure with the number of documents so rows stay readable.
        height=max(400, 30 * len(pivot))
    )
    fig_heatmap.show()

In [3]:
def get_all_model_names(pred_path):
    """Return the sorted, de-duplicated model names found in a predictions file.

    The file at ``pred_path`` is read as UTF-8 JSON and iterated as a sequence
    of document dicts. Each document may carry an ``"annotations"`` list;
    every annotation that has a ``"model_name"`` key contributes that value.
    Documents without annotations and annotations without a model name are
    skipped.
    """
    with open(pred_path, encoding="utf-8") as handle:
        documents = json.load(handle)

    unique_names = {
        annotation["model_name"]
        for document in documents
        for annotation in document.get("annotations", [])
        if "model_name" in annotation
    }
    return sorted(unique_names)

In [4]:
# Evaluate every model found in the predictions file against the gold standard,
# print a per-document / per-field score report, and plot an overview per model.

# Input files (paths are relative to the notebook's working directory).
gold_path = "input/gold_standard_events.json"
pred_path = "output/270525/chat_responses_with_instructions.json"
# Distinct model names present in the predictions file, sorted.
model_names = get_all_model_names(pred_path)

# model name -> whatever evaluate() returned for that model; kept for later use.
all_results = {}
for model_name in model_names:
    print(f"\n=== Evaluation for model: {model_name} ===")
    # The loops below read `results` as {document: {field: scores}}, where each
    # scores dict has the keys precision, recall, f1, tp, fp and fn.
    results = evaluate(gold_path, pred_path, model_name=model_name)
    all_results[model_name] = results
    for doc, fields in results.items():
        print(f"\nDocument: {doc}")
        for field, scores in fields.items():
            print(f"{field}: P={scores['precision']:.2f} R={scores['recall']:.2f} F1={scores['f1']:.2f} (TP={scores['tp']} FP={scores['fp']} FN={scores['fn']})")
    # One bar chart + one heatmap per model.
    plot_overview(results)


=== Evaluation for model: chevalblanc/claude-3-haiku:latest ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.11 R=0.10 F1=0.11 (TP=1 FP=8 FN=9)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=23 FN=17)
event_when: P=0.67 R=0.74 F1=0.70 (TP=14 FP=7 FN=5)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.20 R=0.12 F1=0.15 (TP=1 FP=4 FN=7)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=13 FN=7)
event_when: P=0.33 R=0.50 F1=0.40 (TP=3 FP=6 FN=3)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.00 R=0.00 F1=0.00 (TP=0 FP=5 FN=8)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=11 FN=13)
event_when: P=0.36 R=0.33 F1=0.35 (TP=4 FP=7 FN=8)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event


=== Evaluation for model: deepseek-r1:8b ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=0.50 F1=0.67 (TP=1 FP=0 FN=1)
event_who: P=0.50 R=0.40 F1=0.44 (TP=4 FP=4 FN=6)
event_what: P=0.15 R=0.12 F1=0.13 (TP=2 FP=11 FN=15)
event_when: P=0.70 R=0.37 F1=0.48 (TP=7 FP=3 FN=12)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=0.67 R=1.00 F1=0.80 (TP=2 FP=1 FN=0)
event_who: P=0.29 R=0.50 F1=0.36 (TP=4 FP=10 FN=4)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=22 FN=7)
event_when: P=0.00 R=0.00 F1=0.00 (TP=0 FP=15 FN=6)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=0.00 R=0.00 F1=0.00 (TP=0 FP=2 FN=2)
event_who: P=0.14 R=0.12 F1=0.13 (TP=1 FP=6 FN=7)
event_what: P=0.08 R=0.08 F1=0.08 (TP=1 FP=11 FN=12)
event_when: P=0.62 R=0.42 F1=0.50 (TP=5 FP=3 FN=7)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=0.00 R=0


=== Evaluation for model: gemma3:12b ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.88 R=0.70 F1=0.78 (TP=7 FP=1 FN=3)
event_what: P=0.19 R=0.18 F1=0.18 (TP=3 FP=13 FN=14)
event_when: P=0.75 R=0.47 F1=0.58 (TP=9 FP=3 FN=10)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.25 R=0.38 F1=0.30 (TP=3 FP=9 FN=5)
event_what: P=0.07 R=0.14 F1=0.10 (TP=1 FP=13 FN=6)
event_when: P=0.17 R=0.17 F1=0.17 (TP=1 FP=5 FN=5)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.38 R=0.38 F1=0.38 (TP=3 FP=5 FN=5)
event_what: P=0.12 R=0.15 F1=0.13 (TP=2 FP=15 FN=11)
event_when: P=0.42 R=0.42 F1=0.42 (TP=5 FP=7 FN=7)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=0.00 R=0.00 F1


=== Evaluation for model: incept5/llama3.1-claude:latest ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.67 R=0.60 F1=0.63 (TP=6 FP=3 FN=4)
event_what: P=0.33 R=0.24 F1=0.28 (TP=4 FP=8 FN=13)
event_when: P=0.83 R=0.79 F1=0.81 (TP=15 FP=3 FN=4)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=0.00 R=0.00 F1=0.00 (TP=0 FP=0 FN=2)
event_who: P=0.00 R=0.00 F1=0.00 (TP=0 FP=0 FN=8)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=0 FN=7)
event_when: P=0.00 R=0.00 F1=0.00 (TP=0 FP=0 FN=6)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.00 R=0.00 F1=0.00 (TP=0 FP=8 FN=8)
event_what: P=0.05 R=0.08 F1=0.06 (TP=1 FP=19 FN=12)
event_when: P=0.57 R=0.33 F1=0.42 (TP=4 FP=3 FN=8)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type


=== Evaluation for model: llama3.3:latest ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.12 R=0.10 F1=0.11 (TP=1 FP=7 FN=9)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=21 FN=17)
event_when: P=0.71 R=0.79 F1=0.75 (TP=15 FP=6 FN=4)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.17 R=0.12 F1=0.14 (TP=1 FP=5 FN=7)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=12 FN=7)
event_when: P=0.25 R=0.33 F1=0.29 (TP=2 FP=6 FN=4)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.14 R=0.12 F1=0.13 (TP=1 FP=6 FN=7)
event_what: P=0.08 R=0.08 F1=0.08 (TP=1 FP=11 FN=12)
event_when: P=0.50 R=0.50 F1=0.50 (TP=6 FP=6 FN=6)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=1.00 R=0.


=== Evaluation for model: mistral:latest ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.14 R=0.10 F1=0.12 (TP=1 FP=6 FN=9)
event_what: P=0.10 R=0.06 F1=0.07 (TP=1 FP=9 FN=16)
event_when: P=0.50 R=0.05 F1=0.10 (TP=1 FP=1 FN=18)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.11 R=0.12 F1=0.12 (TP=1 FP=8 FN=7)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=14 FN=7)
event_when: P=0.14 R=0.17 F1=0.15 (TP=1 FP=6 FN=5)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.17 R=0.12 F1=0.14 (TP=1 FP=5 FN=7)
event_what: P=0.10 R=0.08 F1=0.09 (TP=1 FP=9 FN=12)
event_when: P=0.56 R=0.42 F1=0.48 (TP=5 FP=4 FN=7)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=0.00 R=0.00 