# Flair Evaluation - Optimierung

Es wird untersucht, wie gut die Entitäten DATE, TIME, EVENT im Datensatz erkannt werden.  
Der regelbasierte Ansatz wird zum einen separat und zum anderen zusammen mit NER durch Flair (nur LOC) evaluiert.


---
#### 1. Evaluation DATE, TIME, EVENT 
    1.1 alle Entitäten einbeziehen
    1.2 ohne Einbeziehung von TOPIC und LOC
#### 2. Evaluation Kombination von Regeln und Flair
    2.1 alle Entitäten einbeziehen
    2.2 ohne Einbeziehung von TOPIC
---

### 1. Evaluation DATE, TIME, EVENT 
#### 1.1 alle Entitäten einbeziehen

In [1]:
import json
import pandas as pd
from collections import Counter, defaultdict

GOLD_PATH = "../../data/data_annotated.json"
PRED_PATH = "../../data/NER/flair/ner_regex_results.json"
OUTPUT_PATH = "../../data/NER/flair/ner_optimierung_regex_evaluation.csv"


def _prf(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given TP/FP/FN counts.

    A score whose denominator would be zero is reported as 0 instead of
    raising ``ZeroDivisionError``.
    """
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def evaluate_spans(gt_data, pred_data, relevant_labels=None):
    """Score predicted entities against the gold annotation by exact match.

    An entity is a true positive only when its (start, end, label) triple is
    identical in gold and prediction.

    Args:
        gt_data: Iterable of dicts, each with a "file_name" key and an
            optional "entities" list of dicts with "start", "end", "label".
        pred_data: Dict mapping a file name to a list of entity dicts with
            the same three keys. A file that is missing (or mapped to None)
            is treated as "nothing predicted", so all of its gold entities
            count as false negatives.
        relevant_labels: Optional collection of labels. When given, entities
            with any other label are ignored on both sides.

    Returns:
        A tuple ``(totals, label_stats, file_scores)``: ``totals`` is
        ``(tp, fp, fn)`` summed over all files, ``label_stats`` maps each
        label to ``[tp, fp, fn]``, and ``file_scores`` is a list with one
        dict of scores and counts per file.
    """
    tp = fp = fn = 0
    label_stats = {}
    file_scores = []

    for gt_entry in gt_data:
        file_name = gt_entry["file_name"]
        gold = gt_entry.get("entities", [])
        # A file without predictions must not crash the evaluation.
        pred = pred_data.get(file_name) or []

        # Sets of (start, end, label)
        gold_spans = {
            (e["start"], e["end"], e["label"])
            for e in gold
            if relevant_labels is None or e["label"] in relevant_labels
        }
        pred_spans = {
            (e["start"], e["end"], e["label"])
            for e in pred
            if relevant_labels is None or e["label"] in relevant_labels
        }

        hits = gold_spans & pred_spans
        spurious = pred_spans - gold_spans
        missed = gold_spans - pred_spans

        # TP/FP/FN for the overall result
        tp += len(hits)
        fp += len(spurious)
        fn += len(missed)

        # Per-file scores
        precision_file, recall_file, f1_file = _prf(len(hits), len(spurious), len(missed))
        file_scores.append({
            "file_name": file_name,
            "precision": precision_file,
            "recall": recall_file,
            "f1_score": f1_file,
            "tp": len(hits),
            "fp": len(spurious),
            "fn": len(missed),
        })

        # Per-label counts: the label is part of each span tuple, so every
        # span can be booked directly into its label's TP/FP/FN slot.
        for slot, spans in enumerate((hits, spurious, missed)):
            for _, _, label in spans:
                label_stats.setdefault(label, [0, 0, 0])[slot] += 1

    return (tp, fp, fn), label_stats, file_scores


# The guard is true when the cell runs in a notebook or as a script, so the
# evaluation still executes there; it only keeps a plain import free of I/O.
if __name__ == "__main__":
    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gt_data = json.load(f)

    with open(PRED_PATH, "r", encoding="utf-8") as f:
        pred_data = json.load(f)

    (tp, fp, fn), label_stats, file_scores = evaluate_spans(gt_data, pred_data)

    # -------------------------
    # Overall results
    # -------------------------

    precision, recall, f1 = _prf(tp, fp, fn)

    print("\n=== Gesamtbewertung ===")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print(f"F1-Score : {f1:.2f}")

    # -------------------------
    # Scores per label (sorted so the output order is reproducible)
    # -------------------------

    print("\n=== Bewertung pro Label ===")
    for label, (tp_l, fp_l, fn_l) in sorted(label_stats.items()):
        p, r, f = _prf(tp_l, fp_l, fn_l)
        print(f"{label:<10} P: {p:.2f}  R: {r:.2f}  F1: {f:.2f}")

    # Persist the per-file scores.
    df = pd.DataFrame(file_scores)
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")





=== Gesamtbewertung ===
Precision: 0.68
Recall   : 0.34
F1-Score : 0.45

=== Bewertung pro Label ===
DATE       P: 0.58  R: 0.64  F1: 0.61
EVENT      P: 0.73  R: 0.55  F1: 0.62
LOC        P: 0.00  R: 0.00  F1: 0.00
TIME       P: 0.80  R: 0.76  F1: 0.78
TOPIC      P: 0.00  R: 0.00  F1: 0.00


#### 1.2 Betrachtung ohne TOPIC und LOC

In [2]:
import json
import pandas as pd
from collections import Counter, defaultdict

GOLD_PATH = "../../data/data_annotated.json"
PRED_PATH = "../../data/NER/flair/ner_regex_results.json"
OUTPUT_PATH = "../../data/NER/flair/ner_optimierung_regex_evaluation_without_topic.csv"

# Only these labels take part in the evaluation; all others are ignored.
relevant_labels = {"TIME", "DATE", "EVENT"}


def _prf(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given TP/FP/FN counts.

    A score whose denominator would be zero is reported as 0 instead of
    raising ``ZeroDivisionError``.
    """
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def evaluate_spans(gt_data, pred_data, relevant_labels=None):
    """Score predicted entities against the gold annotation by exact match.

    An entity is a true positive only when its (start, end, label) triple is
    identical in gold and prediction.

    Args:
        gt_data: Iterable of dicts, each with a "file_name" key and an
            optional "entities" list of dicts with "start", "end", "label".
        pred_data: Dict mapping a file name to a list of entity dicts with
            the same three keys. A file that is missing (or mapped to None)
            is treated as "nothing predicted", so all of its gold entities
            count as false negatives.
        relevant_labels: Optional collection of labels. When given, entities
            with any other label are ignored on both sides.

    Returns:
        A tuple ``(totals, label_stats, file_scores)``: ``totals`` is
        ``(tp, fp, fn)`` summed over all files, ``label_stats`` maps each
        label to ``[tp, fp, fn]``, and ``file_scores`` is a list with one
        dict of scores and counts per file.
    """
    tp = fp = fn = 0
    label_stats = {}
    file_scores = []

    for gt_entry in gt_data:
        file_name = gt_entry["file_name"]
        gold = gt_entry.get("entities", [])
        # A file without predictions must not crash the evaluation.
        pred = pred_data.get(file_name) or []

        # Sets of (start, end, label), restricted to the relevant labels
        gold_spans = {
            (e["start"], e["end"], e["label"])
            for e in gold
            if relevant_labels is None or e["label"] in relevant_labels
        }
        pred_spans = {
            (e["start"], e["end"], e["label"])
            for e in pred
            if relevant_labels is None or e["label"] in relevant_labels
        }

        hits = gold_spans & pred_spans
        spurious = pred_spans - gold_spans
        missed = gold_spans - pred_spans

        # TP/FP/FN for the overall result
        tp += len(hits)
        fp += len(spurious)
        fn += len(missed)

        # Per-file scores
        precision_file, recall_file, f1_file = _prf(len(hits), len(spurious), len(missed))
        file_scores.append({
            "file_name": file_name,
            "precision": precision_file,
            "recall": recall_file,
            "f1_score": f1_file,
            "tp": len(hits),
            "fp": len(spurious),
            "fn": len(missed),
        })

        # Per-label counts: the label is part of each span tuple, so every
        # span can be booked directly into its label's TP/FP/FN slot.
        for slot, spans in enumerate((hits, spurious, missed)):
            for _, _, label in spans:
                label_stats.setdefault(label, [0, 0, 0])[slot] += 1

    return (tp, fp, fn), label_stats, file_scores


# The guard is true when the cell runs in a notebook or as a script, so the
# evaluation still executes there; it only keeps a plain import free of I/O.
if __name__ == "__main__":
    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gt_data = json.load(f)

    with open(PRED_PATH, "r", encoding="utf-8") as f:
        pred_data = json.load(f)

    (tp, fp, fn), label_stats, file_scores = evaluate_spans(
        gt_data, pred_data, relevant_labels
    )

    # -------------------------
    # Overall results
    # -------------------------

    precision, recall, f1 = _prf(tp, fp, fn)

    print("\n=== Gesamtbewertung ===")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print(f"F1-Score : {f1:.2f}")

    # -------------------------
    # Scores per label (sorted so the output order is reproducible)
    # -------------------------

    print("\n=== Bewertung pro Label ===")
    for label, (tp_l, fp_l, fn_l) in sorted(label_stats.items()):
        p, r, f = _prf(tp_l, fp_l, fn_l)
        print(f"{label:<10} P: {p:.2f}  R: {r:.2f}  F1: {f:.2f}")

    # Persist the per-file scores.
    df = pd.DataFrame(file_scores)
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")



=== Gesamtbewertung ===
Precision: 0.68
Recall   : 0.64
F1-Score : 0.66

=== Bewertung pro Label ===
DATE       P: 0.58  R: 0.64  F1: 0.61
EVENT      P: 0.73  R: 0.55  F1: 0.62
TIME       P: 0.80  R: 0.76  F1: 0.78


---

### 2. Evaluation Kombination von Regeln und Flair
#### 2.1 alle Entitäten einbeziehen

In [3]:
import json
import pandas as pd

GOLD_PATH = "../../data/data_annotated.json"
PRED_PATH = "../../data/NER/flair/ner_regex_with_flair_results.json"
OUTPUT_PATH = "../../data/NER/flair/ner_optimierung_regex_flair_evaluation.csv"


def _prf(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given TP/FP/FN counts.

    A score whose denominator would be zero is reported as 0 instead of
    raising ``ZeroDivisionError``.
    """
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def evaluate_spans(gt_data, pred_data, relevant_labels=None):
    """Score predicted entities against the gold annotation by exact match.

    An entity is a true positive only when its (start, end, label) triple is
    identical in gold and prediction.

    Args:
        gt_data: Iterable of dicts, each with a "file_name" key and an
            optional "entities" list of dicts with "start", "end", "label".
        pred_data: Dict mapping a file name to a list of entity dicts with
            the same three keys. A file that is missing (or mapped to None)
            is treated as "nothing predicted", so all of its gold entities
            count as false negatives.
        relevant_labels: Optional collection of labels. When given, entities
            with any other label are ignored on both sides.

    Returns:
        A tuple ``(totals, label_stats, file_scores)``: ``totals`` is
        ``(tp, fp, fn)`` summed over all files, ``label_stats`` maps each
        label to ``[tp, fp, fn]``, and ``file_scores`` is a list with one
        dict of scores and counts per file.
    """
    tp = fp = fn = 0
    # Plain dict + setdefault: no reliance on ``defaultdict`` having been
    # imported by an earlier notebook cell.
    label_stats = {}
    file_scores = []

    for gt_entry in gt_data:
        file_name = gt_entry["file_name"]
        gold = gt_entry.get("entities", [])
        # A file without predictions must not crash the evaluation.
        pred = pred_data.get(file_name) or []

        # Sets of (start, end, label)
        gold_spans = {
            (e["start"], e["end"], e["label"])
            for e in gold
            if relevant_labels is None or e["label"] in relevant_labels
        }
        pred_spans = {
            (e["start"], e["end"], e["label"])
            for e in pred
            if relevant_labels is None or e["label"] in relevant_labels
        }

        hits = gold_spans & pred_spans
        spurious = pred_spans - gold_spans
        missed = gold_spans - pred_spans

        # TP/FP/FN for the overall result
        tp += len(hits)
        fp += len(spurious)
        fn += len(missed)

        # Per-file scores
        precision_file, recall_file, f1_file = _prf(len(hits), len(spurious), len(missed))
        file_scores.append({
            "file_name": file_name,
            "precision": precision_file,
            "recall": recall_file,
            "f1_score": f1_file,
            "tp": len(hits),
            "fp": len(spurious),
            "fn": len(missed),
        })

        # Per-label counts: the label is part of each span tuple, so every
        # span can be booked directly into its label's TP/FP/FN slot.
        for slot, spans in enumerate((hits, spurious, missed)):
            for _, _, label in spans:
                label_stats.setdefault(label, [0, 0, 0])[slot] += 1

    return (tp, fp, fn), label_stats, file_scores


# The guard is true when the cell runs in a notebook or as a script, so the
# evaluation still executes there; it only keeps a plain import free of I/O.
if __name__ == "__main__":
    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gt_data = json.load(f)

    with open(PRED_PATH, "r", encoding="utf-8") as f:
        pred_data = json.load(f)

    (tp, fp, fn), label_stats, file_scores = evaluate_spans(gt_data, pred_data)

    # -------------------------
    # Overall results
    # -------------------------

    precision, recall, f1 = _prf(tp, fp, fn)

    print("\n=== Gesamtbewertung ===")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print(f"F1-Score : {f1:.2f}")

    # -------------------------
    # Scores per label (sorted so the output order is reproducible)
    # -------------------------

    print("\n=== Bewertung pro Label ===")
    for label, (tp_l, fp_l, fn_l) in sorted(label_stats.items()):
        p, r, f = _prf(tp_l, fp_l, fn_l)
        print(f"{label:<10} P: {p:.2f}  R: {r:.2f}  F1: {f:.2f}")

    # Persist the per-file scores.
    df = pd.DataFrame(file_scores)
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")


=== Gesamtbewertung ===
Precision: 0.57
Recall   : 0.47
F1-Score : 0.52

=== Bewertung pro Label ===
DATE       P: 0.58  R: 0.64  F1: 0.61
EVENT      P: 0.73  R: 0.55  F1: 0.62
LOC        P: 0.40  R: 0.45  F1: 0.42
TIME       P: 0.80  R: 0.76  F1: 0.78
TOPIC      P: 0.00  R: 0.00  F1: 0.00


#### 2.2 Betrachtung ohne TOPIC

In [4]:
import json
import pandas as pd

GOLD_PATH = "../../data/data_annotated.json"
PRED_PATH = "../../data/NER/flair/ner_regex_with_flair_results.json"
OUTPUT_PATH = "../../data/NER/flair/ner_optimierung_regex_flair_evaluation_without_topic.csv"

# Only these labels take part in the evaluation; all others are ignored.
relevant_labels = {"TIME", "DATE", "EVENT","LOC"}


def _prf(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given TP/FP/FN counts.

    A score whose denominator would be zero is reported as 0 instead of
    raising ``ZeroDivisionError``.
    """
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def evaluate_spans(gt_data, pred_data, relevant_labels=None):
    """Score predicted entities against the gold annotation by exact match.

    An entity is a true positive only when its (start, end, label) triple is
    identical in gold and prediction.

    Args:
        gt_data: Iterable of dicts, each with a "file_name" key and an
            optional "entities" list of dicts with "start", "end", "label".
        pred_data: Dict mapping a file name to a list of entity dicts with
            the same three keys. A file that is missing (or mapped to None)
            is treated as "nothing predicted", so all of its gold entities
            count as false negatives.
        relevant_labels: Optional collection of labels. When given, entities
            with any other label are ignored on both sides.

    Returns:
        A tuple ``(totals, label_stats, file_scores)``: ``totals`` is
        ``(tp, fp, fn)`` summed over all files, ``label_stats`` maps each
        label to ``[tp, fp, fn]``, and ``file_scores`` is a list with one
        dict of scores and counts per file.
    """
    tp = fp = fn = 0
    # Plain dict + setdefault: no reliance on ``defaultdict`` having been
    # imported by an earlier notebook cell.
    label_stats = {}
    file_scores = []

    for gt_entry in gt_data:
        file_name = gt_entry["file_name"]
        gold = gt_entry.get("entities", [])
        # A file without predictions must not crash the evaluation.
        pred = pred_data.get(file_name) or []

        # Sets of (start, end, label), restricted to the relevant labels
        gold_spans = {
            (e["start"], e["end"], e["label"])
            for e in gold
            if relevant_labels is None or e["label"] in relevant_labels
        }
        pred_spans = {
            (e["start"], e["end"], e["label"])
            for e in pred
            if relevant_labels is None or e["label"] in relevant_labels
        }

        hits = gold_spans & pred_spans
        spurious = pred_spans - gold_spans
        missed = gold_spans - pred_spans

        # TP/FP/FN for the overall result
        tp += len(hits)
        fp += len(spurious)
        fn += len(missed)

        # Per-file scores
        precision_file, recall_file, f1_file = _prf(len(hits), len(spurious), len(missed))
        file_scores.append({
            "file_name": file_name,
            "precision": precision_file,
            "recall": recall_file,
            "f1_score": f1_file,
            "tp": len(hits),
            "fp": len(spurious),
            "fn": len(missed),
        })

        # Per-label counts: the label is part of each span tuple, so every
        # span can be booked directly into its label's TP/FP/FN slot.
        for slot, spans in enumerate((hits, spurious, missed)):
            for _, _, label in spans:
                label_stats.setdefault(label, [0, 0, 0])[slot] += 1

    return (tp, fp, fn), label_stats, file_scores


# The guard is true when the cell runs in a notebook or as a script, so the
# evaluation still executes there; it only keeps a plain import free of I/O.
if __name__ == "__main__":
    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gt_data = json.load(f)

    with open(PRED_PATH, "r", encoding="utf-8") as f:
        pred_data = json.load(f)

    (tp, fp, fn), label_stats, file_scores = evaluate_spans(
        gt_data, pred_data, relevant_labels
    )

    # -------------------------
    # Overall results
    # -------------------------

    precision, recall, f1 = _prf(tp, fp, fn)

    print("\n=== Gesamtbewertung ===")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print(f"F1-Score : {f1:.2f}")

    # -------------------------
    # Scores per label (sorted so the output order is reproducible)
    # -------------------------

    print("\n=== Bewertung pro Label ===")
    for label, (tp_l, fp_l, fn_l) in sorted(label_stats.items()):
        p, r, f = _prf(tp_l, fp_l, fn_l)
        print(f"{label:<10} P: {p:.2f}  R: {r:.2f}  F1: {f:.2f}")

    # Persist the per-file scores.
    df = pd.DataFrame(file_scores)
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")


=== Gesamtbewertung ===
Precision: 0.57
Recall   : 0.57
F1-Score : 0.57

=== Bewertung pro Label ===
DATE       P: 0.58  R: 0.64  F1: 0.61
EVENT      P: 0.73  R: 0.55  F1: 0.62
LOC        P: 0.40  R: 0.45  F1: 0.42
TIME       P: 0.80  R: 0.76  F1: 0.78
