# Flair Evaluation - 02 -

In diesem notebook wird untersucht wie gut flair die gewünschten Entitäten (EVENT, TOPIC, DATE, TIME, LOC) erkennt.

Es wird folgendes Model genutzt: **"ner_german_legal"**   
F1-Score: 96,35 (LER German dataset)

Predicts 19 tags:  
kein klassisches LOC, aber ST für Stadt und STR für Straße

 
Die Performance von Flair wird auf dem ground truth untersucht. 

#### note: Es wird bei der Evaluation nur LOC betrachtet

In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("flair/ner-german-legal")

2025-07-27 19:58:27,547 SequenceTagger predicts: Dictionary with 78 tags: <unk>, O, S-AN, B-RS, I-RS, E-RS, B-GS, I-GS, E-GS, B-GRT, I-GRT, E-GRT, S-GRT, B-LIT, I-LIT, E-LIT, B-EUN, I-EUN, E-EUN, B-LD, E-LD, S-RR, B-VT, I-VT, E-VT, B-ORG, I-ORG, E-ORG, B-RR, E-RR, S-GS, B-INN, E-INN, S-INN, S-VT, B-VS, I-VS, E-VS, S-ST, S-LD, S-ORG, B-VO, I-VO, E-VO, S-VS, B-UN, E-UN, S-VO, S-EUN, I-INN


In [2]:
import json
with open("../../data/data_annotated.json", encoding="utf-8") as f:
    all_data = json.load(f)

In [4]:
def extract_flair_entities(text):
    sentence = Sentence(text)
    tagger.predict(sentence)
    predicted = []

    for entity in sentence.get_spans("ner"):
        predicted.append({
            "text": entity.text,
            "start": entity.start_position,
            "end": entity.end_position,
            "label": entity.get_label("ner").value
        })
    return predicted

In [5]:
# Iteration auf dem gesamten Datensatz, Berechnung der Metriken, Speichern der Ergebnisse im Json
from collections import Counter, defaultdict
import json

# Funktion zur erstellung des dictionary
def span_to_dict(span, text):
    return {
        "text": text[span[0]:span[1]],
        "start": span[0],
        "end": span[1],
        "label": span[2]
    }


tp, fp, fn = 0, 0, 0
label_stats = defaultdict(lambda: [0, 0, 0])  # TP, FP, FN pro Label
relevant_labels = {"EVENT", "TOPIC", "TIME", "DATE", "LOC"}
label_mapping = {"ST": "LOC", "STR": "LOC"}
all_results = []

for eintrag in all_data:
    file_name = eintrag.get("file_name", None)
    text = eintrag["text"]
    gold = eintrag.get("entities", [])
    predicted = extract_flair_entities(text)

    # Set aus (start, end, label)
    gold_spans = {(e["start"], e["end"], e["label"]) for e in gold if e["label"] in relevant_labels}
    #pred_spans = {(e["start"], e["end"], e["label"]) for e in predicted if e["label"] in relevant_labels}

     # Mapping anwenden
    pred_spans = {
        (e["start"], e["end"], label_mapping.get(e["label"], e["label"]))
        for e in predicted
        if label_mapping.get(e["label"], e["label"]) in relevant_labels
    }

    # Gesamtmetriken
    tp += len(gold_spans & pred_spans)
    fp += len(pred_spans - gold_spans)
    fn += len(gold_spans - pred_spans)

    # für Berechnung pro Eintrag
    tp_spans = gold_spans & pred_spans
    fp_spans = pred_spans - gold_spans
    fn_spans = gold_spans - pred_spans
    
    tp_count = len(tp_spans)
    fp_count = len(fp_spans)
    fn_count = len(fn_spans)
    
    # lokale Metriken für dieses Dokument berechnen
    precision_local = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0
    recall_local = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0
    f1_local = 2 * precision_local * recall_local / (precision_local + recall_local) if (precision_local + recall_local) > 0 else 0
    

    # Pro Label
    for label in set([e["label"] for e in gold + predicted]):
        if label not in relevant_labels:
            continue 
    
        g = {s for s in gold_spans if s[2] == label}
        p = {s for s in pred_spans if s[2] == label}
        label_stats[label][0] += len(g & p)      # TP
        label_stats[label][1] += len(p - g)      # FP
        label_stats[label][2] += len(g - p)      # FN

    result = {
    "file_name": file_name,
    "text": text,
    "precision": precision_local,
    "recall": recall_local,
    "f1": f1_local,
    "true_positives": [span_to_dict(s, text) for s in tp_spans],
    "false_positives": [span_to_dict(s, text) for s in fp_spans],
    "false_negatives": [span_to_dict(s, text) for s in fn_spans],

    }


    all_results.append(result)

# Speichern der results / Ergebnis pro Eintrag
with open("../../data/NER/flair/results_ner_german_legal.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)


# -------------------------
# 5. Gesamtergebnisse
# -------------------------

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("\n=== Gesamtbewertung ===")
print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1-Score : {f1:.2f}")

# -------------------------
# 6. Bewertung pro Label
# -------------------------

print("\n=== Bewertung pro Label ===")
for label, (tp_l, fp_l, fn_l) in label_stats.items():
    p = tp_l / (tp_l + fp_l) if (tp_l + fp_l) > 0 else 0
    r = tp_l / (tp_l + fn_l) if (tp_l + fn_l) > 0 else 0
    f = 2 * p * r / (p + r) if (p + r) > 0 else 0
    print(f"{label:<10} P: {p:.2f}  R: {r:.2f}  F1: {f:.2f}")


=== Gesamtbewertung ===
Precision: 0.45
Recall   : 0.07
F1-Score : 0.12

=== Bewertung pro Label ===
TIME       P: 0.00  R: 0.00  F1: 0.00
DATE       P: 0.00  R: 0.00  F1: 0.00
LOC        P: 0.46  R: 0.23  F1: 0.30
EVENT      P: 0.00  R: 0.00  F1: 0.00
TOPIC      P: 0.00  R: 0.00  F1: 0.00


---
### Zählung von True Positives, Overlap Matches, Fuzzy Matches

Overlap Machtes: text und label matchen mit Goldstandard  
Fuzzy Matches: Text und Label matchen mit Goldstandard und Position ist auf maximal 2 Stellen Abweichnung korrekt


In [7]:
from collections import defaultdict
import json

with open("../../data/NER/flair/results_ner_german_legal.json", encoding="utf-8") as f:
    pred_data = json.load(f)

with open("../../data/data_annotated.json", encoding="utf-8") as f:
    gold_data = json.load(f)

def span_text(span, text):
    return text[span[0]:span[1]]

def fuzzy_match(gold_span, pred_span, tolerance=2):
    return (
        gold_span[2] == pred_span[2] and
        abs(gold_span[0] - pred_span[0]) <= tolerance and
        abs(gold_span[1] - pred_span[1]) <= tolerance
    )

# Index predicted nach file_name
pred_index = {entry["file_name"]: entry for entry in pred_data}


tp = 0
overlap_matches = 0  # nur text und label sind gleich
fuzzy_matches = 0    # text und label sind gleich und position weicht um maximal 2 Zeichen ab

for eintrag in gold_data:
    file_name = eintrag.get("file_name")
    text = eintrag["text"]
    gold = [e for e in eintrag.get("entities", []) if e["label"] == "LOC"]

    pred_entry = pred_index.get(file_name, {})
    # Kombiniere TP und FP, FN interessieren hier nicht
    predicted = pred_entry.get("true_positives", []) + pred_entry.get("false_positives", [])
    predicted = [e for e in predicted if e["label"] == "LOC"]

    # Erstelle Sets für TP-Zählung
    gold_spans = {(e["start"], e["end"], e["label"]) for e in gold}
    pred_spans = {(e["start"], e["end"], e["label"]) for e in predicted}

    tp += len(gold_spans & pred_spans)

    gold_spans_list = list(gold_spans)
    pred_spans_list = list(pred_spans)

    # Overlap: label und Text gleich
    matched_pred_indices = set()
    for g in gold_spans:
        for i, p in enumerate(pred_spans_list):
            if i in matched_pred_indices:
                continue
            if g[2] == p[2] and span_text(g, text) == span_text(p, text):
                overlap_matches += 1
                matched_pred_indices.add(i)
                break

    # Fuzzy Match: ±2 Zeichen Toleranz bei Start/Ende
    matched_pred_indices_fuzzy = set()
    for g in gold_spans_list:
        for i, p in enumerate(pred_spans_list):
            if i in matched_pred_indices_fuzzy:
                continue
            if fuzzy_match(g, p):
                fuzzy_matches += 1
                matched_pred_indices_fuzzy.add(i)
                break

print(f"\n=== LOC-Ergebnisse ===")
print(f"True Positives (genau): {tp}")
print(f"Overlap Matches (Text & Label gleich): {overlap_matches}")
print(f"Fuzzy Matches (±2 Zeichen): {fuzzy_matches}")
print(f"Fuzzy Matches ohne Overlaps: {fuzzy_matches - overlap_matches}")


=== LOC-Ergebnisse ===
True Positives (genau): 94
Overlap Matches (Text & Label gleich): 98
Fuzzy Matches (±2 Zeichen): 99
Fuzzy Matches ohne Overlaps: 1


--> deutlich weniger true posiives als beim Modell ner_german_large  

nur 5 Locs haben nicht exakt die gleichen Start- und Endposition wie im Goldstandard