#### Es soll überprüft werden, ob hohe Confidence-Werte von EasyOCR tatsächlich mit richtig erkannten Wörtern einhergehen

#### auf komplettem Text prüfen

In [2]:
import easyocr
import json
import os
from difflib import SequenceMatcher
import pandas as pd

# Document-level check: does a high mean EasyOCR confidence go along with a
# high text similarity between the OCR output and the ground truth?

# --- Configuration ---
IMG_DIR = "../../data/images/insta_images"
OUTPUT_PATH = "../../data/OCR/ocr_evaluation_confidence_test.csv"
JSON_PATH = "../../data/original_text.json"


# Initialise the OCR reader (German language model, CPU only).
reader = easyocr.Reader(['de'], gpu=False)

# Load the ground truth; every entry is accessed via 'file_name' and 'text'.
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    gt_data = json.load(f)

results = []

for entry in gt_data:
    filename = entry['file_name']
    # Flatten the ground truth to a single line so it is comparable with the
    # space-joined OCR output below.
    gt_text = entry['text'].strip().replace('\n', ' ')
    image_path = os.path.join(IMG_DIR, filename)

    if not os.path.exists(image_path):
        print(f"Bild nicht gefunden: {filename}")
        continue

    # Run OCR; each result item is indexed as item[1] = text, item[2] = confidence.
    ocr_result = reader.readtext(image_path)

    # Without any detection there is no confidence value to evaluate. Report
    # the skip so it is visible which images are missing from the statistics.
    if not ocr_result:
        print(f"Kein Text erkannt: {filename}")
        continue

    # Join the recognised fragments and average their confidences.
    # ocr_result is non-empty here, so the division is always safe.
    ocr_text = ' '.join([item[1].strip() for item in ocr_result])
    confidences = [item[2] for item in ocr_result]
    avg_conf = sum(confidences) / len(confidences)

    # Character-level similarity between ground truth and OCR output.
    similarity = SequenceMatcher(None, gt_text, ocr_text).ratio()

    results.append({
        "filename": filename,
        "ground_truth": gt_text,
        "ocr_text": ocr_text,
        "avg_confidence": avg_conf,
        "similarity": similarity
    })

# Collect the per-image results in a DataFrame.
df = pd.DataFrame(results)

if df.empty:
    # An empty frame has no columns; accessing them below would raise KeyError.
    print("Keine OCR-Ergebnisse vorhanden - Korrelation kann nicht berechnet werden.")
else:
    # Write to the configured OUTPUT_PATH (previously the constant was unused
    # and the file landed in the current working directory).
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)

    # Pearson correlation between mean confidence and similarity.
    correlation = df['avg_confidence'].corr(df['similarity'])
    print(f"Korrelation zwischen OCR-Confidence und Ähnlichkeit zum Ground Truth: {correlation:.3f}")

Using CPU. Note: This module is much faster with a GPU.


Korrelation zwischen OCR-Confidence und Ähnlichkeit zum Ground Truth: 0.342


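#### --> schlechtes Ergebnis: Die Korrelation von 0,342 ist nur schwach, d. h. die mittlere Confidence sagt kaum etwas über die Ähnlichkeit zum Ground Truth aus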

---

#### auf Zeilenebene prüfen

In [3]:
import easyocr
import json
import os
from difflib import SequenceMatcher
import pandas as pd

# Document-level and line-level check: does EasyOCR's confidence correlate with
# the similarity between the OCR output and the ground truth?

# --- Configuration ---
IMG_DIR = "../../data/images/insta_images"
# Currently unused: the CSV export further down is disabled.
OUTPUT_PATH = "../../data/OCR/ocr_evaluation_confidence_test2.csv"
JSON_PATH = "../../data/original_text.json"

# Initialise the OCR reader (German language model, CPU only).
reader = easyocr.Reader(['de'], gpu=False)

# Load the ground truth; every entry is accessed via 'file_name', 'text'
# and 'lines'.
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    gt_data = json.load(f)

# Result collectors: one record per ground-truth line / per image.
line_results = []
doc_results = []

for entry in gt_data:
    filename = entry['file_name']
    gt_text = entry['text'].strip().replace('\n', ' ')
    gt_lines = [line.strip() for line in entry['lines']]
    image_path = os.path.join(IMG_DIR, filename)

    if not os.path.exists(image_path):
        print(f"Bild nicht gefunden: {filename}")
        continue

    # Run OCR; each result item is indexed as item[1] = text, item[2] = confidence.
    ocr_result = reader.readtext(image_path)

    # Without any detection there is no confidence value to evaluate. Report
    # the skip so it is visible which images are missing from the statistics.
    if not ocr_result:
        print(f"Kein Text erkannt: {filename}")
        continue

    # Extract texts and confidences once; both evaluation levels share them.
    ocr_lines = [item[1].strip() for item in ocr_result]
    ocr_confidences = [item[2] for item in ocr_result]

    # --- Document level ---
    ocr_text = ' '.join(ocr_lines)
    # ocr_result is non-empty here, so the division is always safe.
    avg_conf = sum(ocr_confidences) / len(ocr_confidences)
    similarity = SequenceMatcher(None, gt_text, ocr_text).ratio()

    doc_results.append({
        "filename": filename,
        "ground_truth": gt_text,
        "ocr_text": ocr_text,
        "avg_confidence": avg_conf,
        "similarity": similarity
    })

    # --- Line level ---
    # Greedy matching: each ground-truth line is paired with the OCR fragment
    # it is most similar to (several lines may pick the same fragment).
    for gt_line in gt_lines:
        best_match = None
        best_similarity = 0
        # NaN rather than 0: if no OCR fragment overlaps at all there is no
        # confidence to report, and a fabricated (0, 0) point would inflate
        # the correlation. pandas' corr() leaves NaN pairs out.
        best_conf = float("nan")

        # Use a dedicated loop name so the document-level ocr_text above is
        # not overwritten.
        for ocr_line, conf in zip(ocr_lines, ocr_confidences):
            sim = SequenceMatcher(None, gt_line, ocr_line).ratio()
            if sim > best_similarity:
                best_similarity = sim
                best_match = ocr_line
                best_conf = conf

        line_results.append({
            "filename": filename,
            "gt_line": gt_line,
            "ocr_match": best_match,
            "line_similarity": best_similarity,
            "line_confidence": best_conf
        })

# Build the result frames for analysis.
df_doc = pd.DataFrame(doc_results)
df_lines = pd.DataFrame(line_results)

# CSV export is intentionally disabled; re-enable if the raw tables are needed.
#df_doc.to_csv("ocr_document_level.csv", index=False)
#df_lines.to_csv("ocr_line_level.csv", index=False)

# Compute the correlations.
if df_doc.empty or df_lines.empty:
    # An empty frame has no columns; accessing them below would raise KeyError.
    print("Keine auswertbaren OCR-Ergebnisse - Korrelation kann nicht berechnet werden.")
else:
    print("\nDokument-Level:")
    print(f"Korrelation (Confidence vs. Ähnlichkeit): {df_doc['avg_confidence'].corr(df_doc['similarity']):.3f}")

    print("\nZeilen-Level:")
    print(f"Korrelation (Confidence vs. Ähnlichkeit): {df_lines['line_confidence'].corr(df_lines['line_similarity']):.3f}")


Using CPU. Note: This module is much faster with a GPU.



Dokument-Level:
Korrelation (Confidence vs. Ähnlichkeit): 0.342

Zeilen-Level:
Korrelation (Confidence vs. Ähnlichkeit): 0.387
