In [1]:
import os
import pandas as pd
import numpy as np
import json


# Maps each row label shown in the tables to the prefix of the matching
# `<prefix>_<match_name>_metric_result.json` file.
ocr_types_dict = {'end2end': 'end2end'}

# Row labels (dict keys) and file prefixes (dict values), in matching order.
official_names = [*ocr_types_dict]
ocr_types = [*ocr_types_dict.values()]

# Folder the metric result JSON files are read from.
result_folder = '../result'

# Matching-method tag embedded in the result file names.
match_name = 'quick_match'


```
┌── Block 1 = global multi-task score table + global "overall" score.
│
├── Block 2 = diagnosis by document type (difficulty profile in text Edit_dist).
│
├── Block 3 = diagnosis by visual page degradations (fuzzy scan, watermark, background) and their impact on Edit_dist (mean + var).
│
├── Block 4 = reading order by page layout (difficulty profile in Edit_dist for single/double/three/other column — mean + var).
│
├── Block 5 = diagnosis by text attributes (text language & background color) and their impact on Edit_dist.
│
├── Block 6 = diagnosis by table attributes (language, line style, spans, equations, background, orientation) and impact on TEDS score (%).
│
├── Block 7 = comparison of text OCR models (text extraction performance by language, background and rotation, via Edit_dist).
│
└── Block 8 = table recognition (impact of language, line style, spans, equations, background and orientation on TEDS score).
```

In [2]:
# Block 1 - Overall score (multi-task + global "overall").

# (category, metric) pairs reported in the summary table, in column order.
summary_metrics = [
    ('text_block', 'Edit_dist'),
    ('display_formula', 'CDM'),
    ('table', 'TEDS'),
    ('table', 'TEDS_structure_only'),
    ('reading_order', 'Edit_dist'),
]
# Metrics read from the page-level results and scaled by 100 for display.
percent_metrics = ('CDM', 'TEDS', 'TEDS_structure_only')

dict_list = []

for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
    with open(result_path, 'r') as f:
        result = json.load(f)

    save_dict = {}
    for category_type, metric in summary_metrics:
        column = category_type + '_' + metric
        if metric in percent_metrics:
            page_scores = result[category_type]['page'].get(metric)
            # Page-level average; a missing or empty metric is reported as 0.
            save_dict[column] = page_scores['ALL'] * 100 if page_scores else 0
        else:
            # Edit distances stay on their native scale; NaN when the average is absent.
            save_dict[column] = result[category_type]['all'][metric].get('ALL_page_avg', np.nan)

    dict_list.append(save_dict)

df = pd.DataFrame(dict_list, index=ocr_types_dict.keys()).round(3)

# Overall = mean of three scores: the text edit distance is first turned into
# (1 - distance) * 100, then averaged with CDM and TEDS.
text_score = (1 - df['text_block_Edit_dist']) * 100
df['overall'] = (text_score + df['display_formula_CDM'] + df['table_TEDS']) / 3
# Optional export: df.to_csv('./overall.csv')

df


Unnamed: 0,text_block_Edit_dist,display_formula_CDM,table_TEDS,table_TEDS_structure_only,reading_order_Edit_dist,overall
end2end,0.356,0,80.012,91.455,0.217,48.137333


In [3]:
# Block 2 - Document types (Edit_dist difficulty by PDF source).

# Page-level data-source columns to display, in presentation order.
data_source_columns = [
    'data_source: book',
    'data_source: PPT2PDF',
    'data_source: research_report',
    'data_source: colorful_textbook',
    'data_source: exam_paper',
    'data_source: magazine',
    'data_source: academic_literature',
    'data_source: note',
    'data_source: newspaper',
]

dict_list = []
for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
    with open(result_path, 'r') as f:
        result = json.load(f)
    # One row per model: page-level text Edit_dist keyed by page attribute.
    dict_list.append(result['text_block']['page']['Edit_dist'])

df2 = pd.DataFrame(dict_list, index=official_names)
reordered_df2 = df2.round(3)

# Strict selection: a data source missing from the results raises KeyError.
selected_columns = reordered_df2[data_source_columns].copy()

# The "mean" column is the 'ALL' page-level score taken from the results,
# not an average recomputed over the columns selected above.
selected_columns['mean'] = reordered_df2['ALL']
# Optional export: selected_columns.to_csv('./data_source.csv')

selected_columns


Unnamed: 0,data_source: book,data_source: PPT2PDF,data_source: research_report,data_source: colorful_textbook,data_source: exam_paper,data_source: magazine,data_source: academic_literature,data_source: note,data_source: newspaper,mean
end2end,0.43,0.004,0.613,0.49,0.039,0.362,0.123,0.582,0.562,0.356


In [4]:
# Block 3 - Visual degradations (fuzzy scan / watermark / background impact, mean + var).

def get_columns(df, required_columns):
    """Return a new frame holding exactly `required_columns`, in that order.

    Columns listed in `required_columns` that are absent from `df` appear in
    the result filled with NaN. `df` itself is left unmodified, so calling
    this helper has no side effect on the caller's frame.

    Args:
        df: Source DataFrame.
        required_columns: Column labels to select.

    Returns:
        An independent DataFrame with the same index as `df`.
    """
    # reindex both selects and NaN-fills in one step without touching `df`.
    return df.reindex(columns=required_columns).copy()

dict_list = []
dict_list_var = []

for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')

    with open(result_path, 'r') as f:
        metric_result = json.load(f)

    page_metrics = metric_result['text_block']['page']
    dict_list.append(page_metrics['Edit_dist'])
    # The variance table is optional in the result file; an empty dict yields NaN columns.
    dict_list_var.append(page_metrics.get('Edit_dist_var', {}))

df2 = pd.DataFrame(dict_list, index=official_names)
df2_var = pd.DataFrame(dict_list_var, index=official_names)
reordered_df2 = df2.round(3)
reordered_df2_var = df2_var.round(3)

# Select the page-degradation columns this block reports. Selecting the
# 'layout: ...' columns here would leave nothing to rename to the *_mean/*_var
# names below, and the final reindex would then return an all-NaN table.
# NOTE(review): the key spelling 'colorful_backgroud' is kept as-is on the
# assumption that it matches the keys in the result JSON - confirm.
required_columns = [
    'fuzzy_scan',
    'watermark',
    'colorful_backgroud'
]

# Suffix the columns so the mean and variance tables can be merged side by side.
selected_columns = get_columns(reordered_df2, required_columns).add_suffix('_mean')
selected_columns_var = get_columns(reordered_df2_var, required_columns).add_suffix('_var')

result = pd.merge(selected_columns, selected_columns_var, left_index=True, right_index=True)

# Interleave mean/var per degradation type for display.
result = result.reindex(columns=[
    'fuzzy_scan_mean','fuzzy_scan_var',
    'watermark_mean','watermark_var',
    'colorful_backgroud_mean','colorful_backgroud_var'
])

result


Unnamed: 0,fuzzy_scan_mean,fuzzy_scan_var,watermark_mean,watermark_var,colorful_backgroud_mean,colorful_backgroud_var
end2end,,,,,,


In [5]:
# Block 4 - Layout & reading order (single/double/three/other columns, mean + var).

# Page-layout columns to report, in presentation order.
required_cols = [
    'layout: single_column',
    'layout: double_column',
    'layout: three_column',
    'layout: other_layout'
]

dict_list = []
dict_list_var = []

for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
    with open(result_path, 'r') as f:
        metric_result = json.load(f)

    reading_order_page = metric_result['reading_order']['page']
    dict_list.append(reading_order_page['Edit_dist'])
    # The variance table may be absent from the result file; fall back to an empty row.
    dict_list_var.append(reading_order_page.get('Edit_dist_var', {}))

df3 = pd.DataFrame(dict_list, index=official_names)
df3_var = pd.DataFrame(dict_list_var, index=official_names)

reordered_df3 = df3.round(3)
reordered_df3_var = df3_var.round(3)

# reindex keeps the table shape stable: layouts missing from the results show as NaN.
selected_columns3 = reordered_df3.reindex(columns=required_cols).add_suffix('_mean')
selected_columns3_var = reordered_df3_var.reindex(columns=required_cols).add_suffix('_var')

result = selected_columns3.merge(selected_columns3_var, left_index=True, right_index=True)

# Interleave mean/var per layout: <layout>_mean, <layout>_var, ...
interleaved_cols = [col + suffix for col in required_cols for suffix in ('_mean', '_var')]
result = result.reindex(columns=interleaved_cols)

result


Unnamed: 0,layout: single_column_mean,layout: single_column_var,layout: double_column_mean,layout: double_column_var,layout: three_column_mean,layout: three_column_var,layout: other_layout_mean,layout: other_layout_var
end2end,0.084,0.022,0.299,0.078,0.0,,0.6,0.32


In [6]:
# Block 5 - Text attributes (language + background color effects on Edit_dist).

# Group-level text attribute columns to display, in presentation order.
text_attribute_columns = [
    'text_language: text_english',
    'text_language: text_simplified_chinese',
    'text_language: text_en_ch_mixed',
    'text_background: white',
    'text_background: single_colored',
    'text_background: multi_colored',
]

dict_list = []
for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
    with open(result_path, 'r') as f:
        result = json.load(f)
    # One row per model: group-level text Edit_dist keyed by text attribute.
    dict_list.append(result['text_block']['group']['Edit_dist'])

df4 = pd.DataFrame(dict_list, index=official_names).round(3)

# Strict selection: an attribute missing from the results raises KeyError.
selected_columns = df4[text_attribute_columns]

# Optional export: selected_columns.to_csv('./text_attribute.csv')
selected_columns


Unnamed: 0,text_language: text_english,text_language: text_simplified_chinese,text_language: text_en_ch_mixed,text_background: white,text_background: single_colored,text_background: multi_colored
end2end,0.093,0.735,0.318,0.531,0.341,0.36


In [None]:
# Block 6 - Table attributes (language, lines, spans, equations, background, orientation -> TEDS).

# Group-level table attribute columns to display, in presentation order.
table_attribute_columns = [
    'language: table_en',
    'language: table_simplified_chinese',
    'language: table_en_ch_mixed',
    'line: full_line',
    'line: less_line',
    'line: fewer_line',
    'line: wireless_line',
    'with_span: True',
    'with_span: False',
    'include_equation: True',
    'include_equation: False',
    'include_background: True',
    'include_background: False',
    'table_layout: vertical',
    'table_layout: horizontal',
]

dict_list = []
for ocr_type in ocr_types:
    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
    with open(result_path, 'r') as f:
        result = json.load(f)
    # One row per model: group-level TEDS keyed by table attribute.
    dict_list.append(result['table']['group']['TEDS'])

# Scale TEDS by 100 and keep one decimal for display.
df4 = (pd.DataFrame(dict_list, index=official_names) * 100).round(1)

# Strict selection: an attribute missing from the results raises KeyError.
selected_columns = df4[table_attribute_columns]

# Optional export: selected_columns.to_csv('./table_attribute.csv')
selected_columns


In [None]:
# Block 7 - Text OCR comparison (language, background, rotation -> Edit_dist).

# Cell-local model list. The notebook-wide `ocr_types` / `official_names` are
# deliberately NOT rebound here: Blocks 1-6 build their file names and row
# labels from them, so overwriting them would make a later re-run of those
# cells look for the wrong result files.
text_ocr_types = ['OmniDocBench_easyocr_text_ocr', 'OmniDocBench_openocr_text_ocr']

result_folder = '../result'

dict_list = []

for ocr_type in text_ocr_types:
    # These result files carry no match-method tag in their name.
    result_path = os.path.join(result_folder, f'{ocr_type}_metric_result.json')

    with open(result_path, 'r') as f:
        result = json.load(f)

    # Here the group-level metrics sit at the top level of the result
    # (no 'text_block' level as in the end-to-end files).
    dict_list.append(result['group']['Edit_dist'])

# The file prefixes double as row labels.
df4 = pd.DataFrame(dict_list, index=text_ocr_types)
df4 = df4.round(3)

# Strict selection: an attribute missing from the results raises KeyError.
selected_columns = df4[
    [
        'text_language: text_english',
        'text_language: text_simplified_chinese',
        'text_language: text_en_ch_mixed',
        'text_background: white',
        'text_background: single_colored',
        'text_background: multi_colored',
        'text_rotate: normal',
        'text_rotate: rotate90',
        'text_rotate: rotate270',
        'text_rotate: horizontal'
    ]
]

# Optional export: selected_columns.to_csv('./text_attribute.csv')
selected_columns


In [None]:
# Block 8 - Table recognition (same attributes as Block 6 -> TEDS).

# Cell-local model list. The notebook-wide `ocr_types` / `official_names` are
# deliberately NOT rebound here: Blocks 1-6 build their file names and row
# labels from them, so overwriting them would make a later re-run of those
# cells look for the wrong result files.
table_ocr_types = ['OmniDocBench_rapidtable_ocr']

result_folder = '../result'

dict_list = []

for ocr_type in table_ocr_types:
    # These result files carry no match-method tag in their name.
    result_path = os.path.join(result_folder, f'{ocr_type}_metric_result.json')

    with open(result_path, 'r') as f:
        result = json.load(f)

    # One row per model: group-level TEDS keyed by table attribute.
    dict_list.append(result['table']['group']['TEDS'])

# The file prefixes double as row labels.
df4 = pd.DataFrame(dict_list, index=table_ocr_types)
# Scale TEDS by 100 and keep one decimal for display.
df4 = df4 * 100
df4 = df4.round(1)

# Strict selection: an attribute missing from the results raises KeyError.
selected_columns = df4[
    [
        'language: table_en',
        'language: table_simplified_chinese',
        'language: table_en_ch_mixed',
        'line: full_line',
        'line: less_line',
        'line: fewer_line',
        'line: wireless_line',
        'with_span: True',
        'with_span: False',
        'include_equation: True',
        'include_equation: False',
        'include_background: True',
        'include_background: False',
        'table_layout: vertical',
        'table_layout: horizontal'
    ]
]

# Optional export: selected_columns.to_csv('./table_attribute.csv')
selected_columns
