In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from EvaluatorUtils import evaluate
import json

In [7]:
def plot_overview(results, model_name=None):
    """Show a grouped bar chart and a heatmap summarising evaluation scores.

    Parameters
    ----------
    results : dict
        Nested mapping ``{document: {field: scores}}``. Each ``scores``
        mapping must provide the keys ``"precision"``, ``"recall"`` and
        ``"f1"``; any additional keys are carried along as extra columns.
    model_name : str, optional
        When given, it is appended to both figure titles so that plots
        produced for different models can be told apart.

    Returns
    -------
    None
        The two figures are displayed with ``.show()``. When ``results``
        holds no scores at all, a short message is printed and nothing is
        drawn.
    """
    metrics = ["precision", "recall", "f1"]
    title_suffix = f" ({model_name})" if model_name else ""

    # Flatten the nested mapping into one record per (document, field) pair.
    rows = [
        {"Document": doc, "Field": field, **scores}
        for doc, fields in results.items()
        for field, scores in fields.items()
    ]
    if not rows:
        # Without rows the frame has no "Field" column and groupby would
        # fail with a KeyError, so stop here instead.
        print("No evaluation results to plot.")
        return
    df = pd.DataFrame(rows)

    # Bar plot: metrics averaged over documents, one bar group per field.
    avg_df = df.groupby("Field")[metrics].mean().reset_index()
    avg_df = avg_df.melt(
        id_vars="Field",
        value_vars=metrics,
        var_name="Metric",
        value_name="Score",
    )

    fig_bar = px.bar(
        avg_df,
        x="Field",
        y="Score",
        color="Metric",
        barmode="group",
        title="Average Precision, Recall, F1 per Field" + title_suffix,
        text_auto=".2f",
        range_y=[0, 1],
    )
    fig_bar.update_layout(yaxis_title="Score", legend_title="Metric")
    fig_bar.show()

    # Heatmap: F1 for every document (rows) and field (columns).
    pivot = df.pivot(index="Document", columns="Field", values="f1")
    fig_heatmap = go.Figure(
        data=go.Heatmap(
            z=pivot.values,
            x=pivot.columns,
            y=pivot.index,
            colorscale="YlGnBu",
            colorbar=dict(title="F1 Score"),
            zmin=0,
            zmax=1,
            # Plain 2-D array of strings, same layout as ``z``.
            text=pivot.round(2).astype(str).values,
            hovertemplate="Document: %{y}<br>Field: %{x}<br>F1: %{z:.2f}<extra></extra>",
        )
    )
    fig_heatmap.update_layout(
        title="F1 Score per Document and Field" + title_suffix,
        xaxis_title="Field",
        yaxis_title="Document",
        # 30 px per document row, but never shorter than 400 px.
        height=max(400, 30 * len(pivot)),
    )
    fig_heatmap.show()

In [13]:
def get_all_model_names(pred_path):
    """Return the sorted, de-duplicated model names found in a predictions file.

    Parameters
    ----------
    pred_path : str or path-like
        Path to a UTF-8 encoded JSON file containing a list of document
        objects. Each document may carry an ``"annotations"`` list whose
        entries may carry a ``"model_name"`` key.

    Returns
    -------
    list
        Every distinct ``"model_name"`` value, in sorted order. Documents
        whose ``"annotations"`` entry is missing or ``null`` contribute
        nothing.
    """
    with open(pred_path, encoding="utf-8") as f:
        preds = json.load(f)
    model_names = {
        ann["model_name"]
        for doc in preds
        # ``or []`` also covers an explicit JSON null, which a plain
        # ``.get(..., [])`` default would pass through as None.
        for ann in (doc.get("annotations") or [])
        if "model_name" in ann
    }
    return sorted(model_names)

In [19]:
# Input files: the gold-standard events and the model predictions.
gold_path = "input/gold_standard_events.json"
pred_path = "chat_responses_with_instructions.json"

# Every model name that occurs in the predictions file, in sorted order.
model_names = get_all_model_names(pred_path)

# Results of each evaluation run, keyed by model name.
all_results = {}
for model_name in model_names:
    print(f"\n=== Evaluation for model: {model_name} ===")
    # Keep the current run under ``results`` and store it under its model name.
    results = all_results[model_name] = evaluate(
        gold_path,
        pred_path,
        model_name=model_name,
    )

    # Text report: one line of scores and counts per field, grouped by document.
    for doc, fields in results.items():
        print(f"\nDocument: {doc}")
        for field, scores in fields.items():
            print(
                f"{field}: P={scores['precision']:.2f} R={scores['recall']:.2f} F1={scores['f1']:.2f} (TP={scores['tp']} FP={scores['fp']} FN={scores['fn']})"
            )

    # Charts for the same run.
    plot_overview(results)


=== Evaluation for model: gemma3:12b ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.00 R=0.00 F1=0.00 (TP=0 FP=4 FN=10)
event_what: P=0.25 R=0.06 F1=0.10 (TP=1 FP=3 FN=16)
event_when: P=0.33 R=0.05 F1=0.09 (TP=1 FP=2 FN=18)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.12 R=0.12 F1=0.12 (TP=1 FP=7 FN=7)
event_what: P=0.17 R=0.14 F1=0.15 (TP=1 FP=5 FN=6)
event_when: P=0.17 R=0.17 F1=0.17 (TP=1 FP=5 FN=5)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=0.50 F1=0.67 (TP=1 FP=0 FN=1)
event_who: P=0.00 R=0.00 F1=0.00 (TP=0 FP=3 FN=8)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=4 FN=13)
event_when: P=0.33 R=0.08 F1=0.13 (TP=1 FP=2 FN=11)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=0.00 R=0.00 F1=


=== Evaluation for model: mistral:latest ===

Document: file:/C:/Users/mnavas/CASE%20OF%20ALTAY%20v.%20TURKEY%20(No.%202).docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.67 R=0.20 F1=0.31 (TP=2 FP=1 FN=8)
event_what: P=0.20 R=0.06 F1=0.09 (TP=1 FP=4 FN=16)
event_when: P=0.33 R=0.05 F1=0.09 (TP=1 FP=2 FN=18)

Document: file:/C:/Users/mnavas/CASE%20OF%20BELYAYEV%20AND%20OTHERS%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.20 R=0.12 F1=0.15 (TP=1 FP=4 FN=7)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=7 FN=7)
event_when: P=0.00 R=0.00 F1=0.00 (TP=0 FP=3 FN=6)

Document: file:/C:/Users/mnavas/CASE%20OF%20BIGUN%20v.%20UKRAINE.docx
event_type: P=1.00 R=1.00 F1=1.00 (TP=2 FP=0 FN=0)
event_who: P=0.50 R=0.12 F1=0.20 (TP=1 FP=1 FN=7)
event_what: P=0.00 R=0.00 F1=0.00 (TP=0 FP=2 FN=13)
event_when: P=0.50 R=0.08 F1=0.14 (TP=1 FP=1 FN=11)

Document: file:/C:/Users/mnavas/CASE%20OF%20CABUCAK%20v.%20GERMANY.docx
event_type: P=0.00 R=0.00 