In [1]:
""" install packages for spotting (=detection + recognition) evaluation """
!pip install -q shapely Polygon3 nltk 

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip


In [2]:
""" preprocess json files for gt and multiple preds """
import os
import json
import re
from tqdm import tqdm
from nltk.metrics.distance import edit_distance

# Ground truth: JSONL file that is read line by line further down; every record
# carrying a "gt" key is exported as one text file inside `gt_dir`.
gt_input_path = "../outputs/Inference_OCR/MangaLMM.jsonl"
gt_dir = "page_gt"
os.makedirs(gt_dir, exist_ok=True)

# Predictions: one JSONL file per model. The file stem (e.g. "MangaLMM") becomes
# the model name and the per-model output sub-directory further down.
pred_input_paths = [
    "../outputs/Inference_OCR/MangaLMM.jsonl",
]

def normalize_repeated_symbols(text):
    """Collapse runs of tilde/dash-like symbols and unify their variants.

    A maximal run made of any mix of ``~ ～ 〜 - ー`` is first reduced to the
    run's leading character. Every remaining wave-dash variant then becomes
    ``~`` and every remaining dash / long-vowel mark becomes ``-``.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    squeezed = re.sub(r'([~\～\〜\-\ー]+)', lambda run: run.group(1)[0], text)
    # Canonicalise whatever single symbol survived the squeeze above.
    for pattern, canonical in ((r'[~～〜]', '~'), (r'[-ー]', '-')):
        squeezed = re.sub(pattern, canonical, squeezed)
    return squeezed

def normalize_punctuation(text):
    """Map full-width / special punctuation onto one canonical form.

    ``！`` becomes ``!``, ``？`` becomes ``?`` and ``…`` becomes ``...``.
    Afterwards every ``・``, ``･`` and ``.`` (including the dots just produced
    from an ellipsis) is rewritten as ``・``.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    conversion_map = {
        "！": "!",
        "？": "?",
        "…": "...",
    }
    # Every key is a single character, so a translation table performs the
    # same substitution as a regex alternation with a lookup callback.
    replaced = text.translate(str.maketrans(conversion_map))
    return re.sub(r'[・･.]', '・', replaced)

# save function
def save_lines(filename, items):
    """Write one ``c1,c2,...,####text`` line per item to ``filename``.

    Args:
        filename: Destination path. It is opened in write mode as UTF-8, so
            existing content is replaced.
        items: Iterable of ``(text, coords)`` pairs. ``coords`` is an iterable
            whose elements are stringified and comma-joined.

    Before writing, all whitespace (including the ideographic space U+3000) is
    removed from the text, which is then passed through
    ``normalize_repeated_symbols`` and ``normalize_punctuation`` in that order.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for raw_text, coords in items:
            cleaned = normalize_punctuation(
                normalize_repeated_symbols(re.sub(r"[\s\u3000]+", "", raw_text))
            )
            coord_part = ",".join(str(value) for value in coords)
            out.write(f"{coord_part},####{cleaned}\n")

# -----------------------
# Shared exporter
# -----------------------
def _export_pages(jsonl_path, out_dir, key, desc):
    """Export ``record[key]`` of every JSONL record to ``<out_dir>/<NNNNNNN>.txt``.

    Files are numbered by the 1-based line position in the JSONL file, zero
    padded to seven digits. Records without ``key`` produce no file, but still
    consume their number.
    """
    with open(jsonl_path, "r", encoding="utf-8") as records:
        for page_no, raw in tqdm(enumerate(records, start=1), desc=desc):
            record = json.loads(raw)
            if key not in record:
                continue
            save_lines(os.path.join(out_dir, f"{page_no:07d}.txt"), record[key])

# -----------------------
# GT
# -----------------------
# The GT export is skipped whenever gt_dir already contains at least one entry.
gt_done = bool(os.listdir(gt_dir))

if not gt_done:
    _export_pages(gt_input_path, gt_dir, "gt", "Saving GT")

# -----------------------
# Pred
# -----------------------
for pred_input_path in pred_input_paths:
    # The JSONL file stem doubles as the model name and output folder name.
    model_name = os.path.splitext(os.path.basename(pred_input_path))[0]
    pred_dir = os.path.join("page_prediction", model_name)
    os.makedirs(pred_dir, exist_ok=True)

    _export_pages(pred_input_path, pred_dir, "pred", f"Saving pred for {model_name}")

Saving GT: 1166it [00:02, 565.15it/s]
Saving pred for MangaLMM: 1166it [00:06, 176.27it/s]


In [3]:
import os
import sys
import json
import zipfile
import glob
from collections import Counter

from shapely.geometry import Polygon, LinearRing
from tqdm import tqdm

sys.path.append(os.getcwd())
import text_eval_script
import rrc_evaluation_funcs

def _normalize_polygon_line(line):
    """Return the cleaned ``x1,y1,...,xn,yn,####text`` line, or None if unusable.

    A line is unusable when its coordinate list has odd length, contains a
    non-integer token, cannot be turned into a shapely ``Polygon``, or yields
    an invalid polygon. The caller guarantees that ``",####"`` occurs in
    ``line``.
    """
    parts = line.strip().split(",####")
    rec = parts[1]
    cors = parts[0].split(",")
    if len(cors) % 2 != 0:
        return None

    try:
        pts = [(int(cors[j]), int(cors[j + 1])) for j in range(0, len(cors), 2)]
    except ValueError:
        # int() on a str only fails with ValueError (non-numeric token).
        return None

    try:
        polygon = Polygon(pts)
    except Exception:
        # Best effort: any construction failure just drops this detection.
        return None
    if not polygon.is_valid:
        return None

    # Reverse rings that shapely reports as counter-clockwise so every polygon
    # is emitted with the same winding. (Original author's note: this changes
    # the order to left-bottom, right-bottom, right-top, left-top.)
    if LinearRing(pts).is_ccw:
        pts.reverse()

    return ",".join(f"{x},{y}" for x, y in pts) + ",####" + rec


def _drop_repeated_words(lines, max_word_repeats):
    """Drop every line whose transcription occurs ``max_word_repeats`` or more times."""
    word_counts = Counter(line.split('####')[-1] for line in lines)
    return [
        line for line in lines
        if word_counts[line.split('####')[-1]] < max_word_repeats
    ]


def preprocess_and_zip(origin_file, output_file, zip_path, max_word_repeats=10):
    """Clean every ``*.txt`` annotation file of a folder and zip the result.

    Each input line is expected to look like ``x1,y1,...,xn,yn,####text``;
    lines without ``",####"`` are ignored. A line is kept only if its
    coordinates form a valid polygon (see ``_normalize_polygon_line``) and the
    rewritten line passes ``rrc_evaluation_funcs.validate_tl_line``.

    When ``origin_file`` does not contain the substring ``"gt"`` (i.e. it is a
    prediction folder), a second copy of each cleaned file is written to
    ``"final_" + output_file`` with heavily repeated transcriptions removed,
    and that folder is what gets zipped. Otherwise ``output_file`` is zipped.

    NOTE: ground-truth detection is a plain substring test on the path, so a
    prediction folder whose path contains "gt" is treated as ground truth.

    Args:
        origin_file: Folder holding the raw ``*.txt`` files.
        output_file: Folder receiving the cleaned files. It must already exist.
        zip_path: Path of the zip archive to create. Files are stored flat,
            without a root directory.
        max_word_repeats: Prediction folders only. A transcription occurring
            this many times or more within one file has all of its lines
            removed. Defaults to 10.

    Returns:
        None. A one-line summary of the counters is printed.
    """
    is_gt = "gt" in origin_file
    # Decide the folder to zip up front so that an input folder without any
    # *.txt file no longer reaches the zip step with the name unbound.
    output_zip_folder = output_file if is_gt else "final_" + output_file
    if not is_gt:
        os.makedirs(output_zip_folder, exist_ok=True)

    count_all = 0               # lines carrying the ",####" marker
    count_all_but_rrcvalid = 0  # lines with a usable polygon, before RRC validation
    count_exception = 0         # lines rejected by validate_tl_line

    for file in sorted(glob.glob(os.path.join(origin_file, "*.txt"))):
        # Build target paths from the base name rather than with str.replace on
        # the whole path, which would rewrite every occurrence of the folder
        # string.
        base_name = os.path.basename(file)
        out = os.path.join(output_file, base_name)

        with open(file, "r", encoding="utf8") as fin:
            raw_lines = fin.readlines()

        kept = []
        for line in raw_lines:
            if ",####" not in line:
                continue
            count_all += 1

            outstr = _normalize_polygon_line(line)
            if outstr is None:
                continue

            count_all_but_rrcvalid += 1
            # to handle Exception: b"Line in sample not valid. ..."
            try:
                rrc_evaluation_funcs.validate_tl_line(
                    # outstr, LTRB, withTranscription, withConfidence, imWidth, imHeight
                    outstr,
                    False,
                    True,
                    False,
                    0,
                    0,
                )
            except Exception:
                count_exception += 1
                continue
            kept.append(outstr)

        with open(out, "w", encoding="utf8") as fout:
            fout.writelines(kept_line + "\n" for kept_line in kept)

        if is_gt:
            # Ground truth is never de-duplicated.
            continue

        # Remove transcriptions repeated max_word_repeats times or more.
        # strip() mirrors re-reading the cleaned file line by line.
        filtered_lines = _drop_repeated_words(
            [kept_line.strip() for kept_line in kept], max_word_repeats
        )
        deduplicate = os.path.join(output_zip_folder, base_name)
        with open(deduplicate, 'w', encoding='utf-8') as f:
            f.writelines(kept_line + '\n' for kept_line in filtered_lines)

    print(f"count_all:{count_all}\tcount_all_but_rrcvalid:{count_all_but_rrcvalid}\tcount_exception:{count_exception}")
    # create zip file for eval
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # sorted() keeps the archive member order deterministic.
        for filename in sorted(os.listdir(output_zip_folder)):
            file_path = os.path.join(output_zip_folder, filename)
            # no root directory
            zipf.write(file_path, arcname=filename)

In [4]:
# Clean the exported ground-truth pages and pack them into ./page_gt.zip,
# the archive the evaluation cell reads as `gt_path`.
origin_file = "page_gt/"
output_file = "final_" + origin_file  # -> "final_page_gt/"
# preprocess_and_zip opens files inside output_file for writing, so the folder
# has to exist before the call.
os.makedirs(output_file, exist_ok=True)
zip_path = "./page_gt.zip"
preprocess_and_zip(origin_file, output_file, zip_path)

count_all:25651	count_all_but_rrcvalid:25651	count_exception:0


In [5]:
import os

# Folder layout: raw per-model predictions, cleaned intermediates, final zips.
base_dir = "page_prediction"
tmp_base_dir = "page_prediction_tmp"
final_zip_dir = "final_zip"
os.makedirs(tmp_base_dir, exist_ok=True)
os.makedirs(final_zip_dir, exist_ok=True)

# One zip per model sub-directory; plain files directly under base_dir are skipped.
for folder_name in sorted(os.listdir(base_dir)):
    origin_file = os.path.join(base_dir, folder_name)
    if not os.path.isdir(origin_file):
        continue

    output_file = os.path.join(tmp_base_dir, folder_name)
    os.makedirs(output_file, exist_ok=True)
    zip_path = f"{final_zip_dir}/{folder_name}.zip"

    # NOTE: preprocess_and_zip treats any path containing "gt" as ground truth,
    # so model folder names should not contain that substring.
    preprocess_and_zip(origin_file, output_file, zip_path)
    print(f"Zipped: {zip_path}")

count_all:24832	count_all_but_rrcvalid:24832	count_exception:0
Zipped: final_zip/MangaLMM.zip


In [6]:
import os
import sys
sys.path.append(os.getcwd())
import text_eval_script
import rrc_evaluation_funcs

# Single ground-truth archive shared by every evaluation run.
gt_path = "page_gt.zip"

# Evaluate each prediction archive found in pred_dir, in sorted name order.
pred_dir = "final_zip"
for zip_file in sorted(os.listdir(pred_dir)):
    if not zip_file.endswith(".zip"):
        continue

    result_path = os.path.join(pred_dir, zip_file)
    print(f"Evaluating: {zip_file}")

    # The return value is discarded.
    _ = text_eval_script.text_eval_main(
        det_file=result_path,
        gt_file=gt_path,
        is_word_spotting=False,
        is_NED=True,
    )

Evaluating: MangaLMM.zip
Calculated!
"DET_PRH:82.2,75.3,78.6"
"E2E_PRH:74.8,68.5,71.5"
