In [1]:
import gzip
import json
import warnings
from pathlib import Path
from typing import Any, Dict, Iterable, List, Union

import pandas as pd

In [2]:
def _iter_jsonl(path: Path) -> Iterable[Dict]:
    """Yield dicts from a single .jsonl or .jsonl.gz file."""
    opener = gzip.open if path.suffix == ".gz" or path.name.endswith(".jsonl.gz") else open
    mode = "rt" if opener is gzip.open else "r"
    with opener(path, mode, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # If a writer accidentally dumped arrays/objects per line, normalize:
            if isinstance(obj, dict):
                yield obj
            elif isinstance(obj, list):
                for item in obj:
                    if isinstance(item, dict):
                        yield item
                    else:
                        raise ValueError(f"Non-dict item in list in {path}: {type(item)}")
            else:
                raise ValueError(f"Non-dict JSON in {path}: {type(obj)}")

def load_jsonls(root: Union[str, Path]) -> List[Dict]:
    """Recursively load all *.jsonl / *.jsonl.gz under root into one list of dicts.

    Files are read in a deterministic order: sorted by parent directory
    (POSIX form), then by file name.

    Args:
        root: Directory to search recursively.

    Returns:
        All rows from all matching files, concatenated. Empty when root does
        not exist or contains no matching files; a UserWarning is emitted in
        both cases so a wrong path does not go unnoticed.
    """
    root = Path(root)
    if not root.exists():
        # Globbing a missing directory would just yield nothing; say so loudly.
        warnings.warn(
            f"load_jsonls: {root} does not exist; returning no rows",
            stacklevel=2,
        )
        return []

    files = sorted(
        [*root.rglob("*.jsonl"), *root.rglob("*.jsonl.gz")],
        key=lambda p: (p.parent.as_posix(), p.name)
    )
    if not files:
        warnings.warn(
            f"load_jsonls: no *.jsonl / *.jsonl.gz files found under {root}; "
            "returning no rows",
            stacklevel=2,
        )

    all_rows: List[Dict] = []
    for fp in files:
        all_rows.extend(_iter_jsonl(fp))
    return all_rows


def def_analyze_docs(
    docs: List[Dict[str, Any]],
    group_name: str,
    clsfix_applied: bool = False
) -> pd.DataFrame:
    """
    Summarize parsed documents as one DataFrame row per document.

    Columns:
      - path: filename only (text after the last '/' in doc['path']);
        None when doc['path'] is missing or not a string
      - full_text_len: len(doc['text']) if it is a string, else 0
      - page_count: len(doc['metadata']['page_char_idx']) if that is a
        list/tuple, else missing (<NA>)
      - page_char_idx: list copy of doc['metadata']['page_char_idx'] if
        present as a list/tuple, else None
      - group: provided group_name
      - beforeclsfix: the clsfix_applied flag, stored as passed
        (NOTE(review): the column name reads like the inverse of
        "clsfix_applied" -- confirm which meaning is intended)

    Column dtypes are fixed (int64 / nullable Int64 / bool) so that frames
    built from empty inputs or from documents without page indices keep the
    same dtypes and concatenate cleanly with the others.
    """
    columns = ['path', 'full_text_len', 'page_count', 'page_char_idx', 'group', 'beforeclsfix']

    rows = []
    for doc in docs:
        raw_path = doc.get('path')
        # keep filename only
        path = raw_path.split('/')[-1] if isinstance(raw_path, str) else None

        text = doc.get('text', "")
        full_text_len = len(text) if isinstance(text, str) else 0

        page_char_idx = None
        page_count = None
        meta = doc.get('metadata')
        if isinstance(meta, dict):
            pci = meta.get('page_char_idx')
            if isinstance(pci, (list, tuple)):
                page_char_idx = list(pci)
                page_count = len(page_char_idx)

        rows.append({
            'path': path,
            'full_text_len': full_text_len,
            'page_count': page_count,
            'page_char_idx': page_char_idx,
            'group': group_name,
            'beforeclsfix': clsfix_applied,
        })

    # Without explicit dtypes, a single missing page_count turns the column
    # into float64 (5 -> 5.0) and an empty input yields all-object columns.
    return pd.DataFrame(rows, columns=columns).astype({
        'full_text_len': 'int64',
        'page_count': 'Int64',
        'beforeclsfix': 'bool',
    })

In [3]:
# Paths
# Every dataset lives under one output directory; edit OUTPUT_ROOT to point
# the notebook at a different copy of the data.
OUTPUT_ROOT = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output')

# PyMuPDF reference parse
p_reference = OUTPUT_ROOT / 'pymupdf_reference' / 'parsed_pdfs'
# AdaParse runs: by document / by page, each with and without fill-in
p_doc = OUTPUT_ROOT / 'adaparse_small_test_by_doc'
p_doc_fi = OUTPUT_ROOT / 'adaparse_small_test_by_doc_with_fillin'
p_page_fi = OUTPUT_ROOT / 'adaparse_small_test_by_page_with_fillin'
p_page = OUTPUT_ROOT / 'adaparse_small_test_by_page'

In [4]:
# Read each AdaParse output tree into a flat list of document dicts.
# Document-level runs:
docs_with_fillin = load_jsonls(p_doc_fi)
docs_no_fillin = load_jsonls(p_doc)
# Page-level runs:
pages_no_fillin = load_jsonls(p_page)
pages_with_fillin = load_jsonls(p_page_fi)

In [5]:
# One summary frame per parser output. The AdaParse frames reuse the document
# lists already loaded in the previous cell rather than re-reading the same
# directories from disk.
# PyMuPDF
df0 = def_analyze_docs(load_jsonls(p_reference), 'pymupdf')
# AdaParse
df1 = def_analyze_docs(docs_no_fillin, 'doc_nofi')
df2 = def_analyze_docs(docs_with_fillin, 'doc_fi')
df3 = def_analyze_docs(pages_with_fillin, 'page_fi')  # label was misspelled 'dpage_fi'
df4 = def_analyze_docs(pages_no_fillin, 'page_nofi')

# merge into one
out = pd.concat([df0, df1, df2, df3, df4], axis=0, ignore_index=True)

In [6]:
# Preview the first rows only; to inspect a single document instead, use
#   out[out['path'] == '5_of_20.pdf']
out.head(10)

Unnamed: 0,path,full_text_len,page_count,page_char_idx,group,beforeclsfix
0,8_of_20.pdf,12965,5,"[0, 2509, 5249, 7977, 10719]",pymupdf,False
1,16_of_20.pdf,42711,16,"[0, 1037, 4188, 7501, 9894, 12526, 15735, 1852...",pymupdf,False
2,3_of_20.pdf,52512,23,"[0, 1011, 3048, 6408, 9264, 11518, 13675, 1661...",pymupdf,False
3,1_of_20.pdf,4500,2,"[0, 3739]",pymupdf,False
4,14_of_20.pdf,48633,20,"[0, 1273, 3921, 5595, 8646, 11895, 14582, 1748...",pymupdf,False
5,9_of_20.pdf,35564,14,"[0, 2881, 6517, 10275, 13643, 17154, 20093, 22...",pymupdf,False
6,5_of_20.pdf,19026,3,"[0, 5551, 11489]",pymupdf,False
7,12_of_20.pdf,12965,5,"[0, 2509, 5249, 7977, 10719]",pymupdf,False
8,7_of_20.pdf,39233,14,"[0, 3105, 6128, 8877, 11644, 14424, 17555, 198...",pymupdf,False
9,6_of_20.pdf,66757,15,"[0, 4490, 9751, 14423, 19943, 24580, 29637, 33...",pymupdf,False


In [7]:
# VISUAL OVERVIEW
# Natural sort by filename: order rows by the first integer in 'path' (so
# '2_of_20.pdf' comes before '10_of_20.pdf'), then by 'path' and 'group' to
# break ties. Rows whose path is missing or contains no digits go last.
# The sort is done in one pass with an explicit helper column instead of two
# chained sorts that only agree because the multi-column sort happens to be
# stable.
out = (
    out
    .assign(
        _path_num=lambda d: (
            d['path']
            .str.extract(r'(\d+)', expand=False)
            .astype(float)
            .fillna(float('inf'))
        )
    )
    .sort_values(['_path_num', 'path', 'group'], na_position='last')
    .drop(columns='_path_num')
)

out

Unnamed: 0,path,full_text_len,page_count,page_char_idx,group,beforeclsfix
3,1_of_20.pdf,4500,2,"[0, 3739]",pymupdf,False
15,2_of_20.pdf,110303,56,"[0, 1735, 3880, 6094, 8329, 10493, 12804, 1503...",pymupdf,False
2,3_of_20.pdf,52512,23,"[0, 1011, 3048, 6408, 9264, 11518, 13675, 1661...",pymupdf,False
13,4_of_20.pdf,55904,13,"[0, 4431, 10232, 16051, 20455, 24999, 30164, 3...",pymupdf,False
6,5_of_20.pdf,19026,3,"[0, 5551, 11489]",pymupdf,False
9,6_of_20.pdf,66757,15,"[0, 4490, 9751, 14423, 19943, 24580, 29637, 33...",pymupdf,False
8,7_of_20.pdf,39233,14,"[0, 3105, 6128, 8877, 11644, 14424, 17555, 198...",pymupdf,False
0,8_of_20.pdf,12965,5,"[0, 2509, 5249, 7977, 10719]",pymupdf,False
5,9_of_20.pdf,35564,14,"[0, 2881, 6517, 10275, 13643, 17154, 20093, 22...",pymupdf,False
10,10_of_20.pdf,44662,18,"[0, 2940, 6024, 9803, 12917, 15860, 18377, 208...",pymupdf,False


In [None]:
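# Scratch notes on the document schema.
# Inspect one record with: docs_no_fillin[0]
#   (raises IndexError if the list is empty, i.e. nothing was loaded)
#
# path:            doc['path']
# full text length: len(doc['text'])
# page count:      len(doc['metadata']['page_char_idx'])  # only if that key is present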

IndexError: list index out of range