In [1]:
from pathlib import Path
import json, gzip
from typing import List, Dict, Iterable, Union
import pandas as pd
from typing import List, Dict, Any

In [4]:
def _iter_jsonl(path: Path) -> Iterable[Dict]:
    """Yield dicts from a single .jsonl or .jsonl.gz file."""
    opener = gzip.open if path.suffix == ".gz" or path.name.endswith(".jsonl.gz") else open
    mode = "rt" if opener is gzip.open else "r"
    with opener(path, mode, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # If a writer accidentally dumped arrays/objects per line, normalize:
            if isinstance(obj, dict):
                yield obj
            elif isinstance(obj, list):
                for item in obj:
                    if isinstance(item, dict):
                        yield item
                    else:
                        raise ValueError(f"Non-dict item in list in {path}: {type(item)}")
            else:
                raise ValueError(f"Non-dict JSON in {path}: {type(obj)}")

def load_jsonls(root: Union[str, Path]) -> List[Dict]:
    """Recursively load all *.jsonl / *.jsonl.gz under root into one list of dicts."""
    root = Path(root)
    files = sorted(
        [*root.rglob("*.jsonl"), *root.rglob("*.jsonl.gz")],
        key=lambda p: (p.parent.as_posix(), p.name)
    )
    all_rows: List[Dict] = []
    for fp in files:
        all_rows.extend(_iter_jsonl(fp))
    return all_rows


def def_analyze_docs(
    docs: List[Dict[str, Any]],
    group_name: str,
    clsfix_applied: bool = False
) -> pd.DataFrame:
    """
    Build a DataFrame with:
      - path: filename only (last part of doc['path'])
      - full_text_len: len(doc['text']) if present, else 0
      - page_count: len(doc['metadata']['page_char_idx']) if present, else None
      - page_char_idx: list of character indices if present, else None
      - group: provided group_name
      - beforeclsfix: boolean flag (clsfix_applied)
    """
    rows = []
    for doc in docs:
        raw_path = doc.get('path')
        path = None
        if isinstance(raw_path, str):
            path = raw_path.split('/')[-1]  # keep filename only

        text = doc.get('text', "")
        full_text_len = len(text) if isinstance(text, str) else 0

        page_char_idx = None
        page_count = None
        meta = doc.get('metadata')
        if isinstance(meta, dict):
            pci = meta.get('page_char_idx')
            if isinstance(pci, (list, tuple)):
                page_char_idx = list(pci)
                page_count = len(page_char_idx)

        rows.append({
            'path': path,
            'full_text_len': full_text_len,
            'page_count': page_count,
            'page_char_idx': page_char_idx,
            'group': group_name,
            'beforeclsfix': clsfix_applied,
        })

    return pd.DataFrame(
        rows,
        columns=['path', 'full_text_len', 'page_count', 'page_char_idx', 'group', 'beforeclsfix']
    )

In [5]:
# Paths
p_reference = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output/pymupdf_reference/parsed_pdfs')
p_doc = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output/adaparse_small_test_by_doc')
p_doc_fi = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output/adaparse_small_test_by_doc_with_fillin')
p_page_fi = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output/adaparse_small_test_by_page_with_fillin')
p_page = Path('/lus/flare/projects/FoundEpidem/siebenschuh/adaparse_data/output/adaparse_small_test_by_page')

In [None]:
# Load as List[document dict]
docs_no_fillin = load_jsonls(p_doc)
docs_with_fillin = load_jsonls(p_doc_fi)
pages_with_fillin = load_jsonls(p_page_fi)
pages_no_fillin = load_jsonls(p_page)

In [6]:
# PyMuPDF
df0 = def_analyze_docs(load_jsonls(p_reference), 'pymupdf')
# AdaParse
df1 = def_analyze_docs(load_jsonls(p_doc), 'doc_nofi')
df2 = def_analyze_docs(load_jsonls(p_doc_fi), 'doc_fi')
df3 = def_analyze_docs(load_jsonls(p_page_fi), 'page_fi')
df4 = def_analyze_docs(load_jsonls(p_page), 'page_nofi')

# merge into one
out = pd.concat([df0, df1, df2, df3, df4], axis=0, ignore_index=True)

In [11]:
import re

# VISUAL OVERVIEW
out = out.sort_values(['path', 'group'], na_position='last')

out[12:16]
# path	full_text_len	page_count	page_char_idx	group	beforeclsfix

Unnamed: 0,path,full_text_len,page_count,page_char_idx,group,beforeclsfix
34,13_of_20.pdf,66757,15,"[0, 4490, 9751, 14423, 19943, 24580, 29637, 33...",doc_nofi,False
58,13_of_20.pdf,64526,15,"[0, 4489, 9749, 14420, 19939, 24575, 29631, 33...",page_fi,False
78,13_of_20.pdf,64526,15,"[0, 4489, 9749, 14420, 19939, 24575, 29631, 33...",page_nofi,False
14,13_of_20.pdf,66757,15,"[0, 4490, 9751, 14423, 19943, 24580, 29637, 33...",pymupdf,False


In [8]:
import pandas as pd
import numpy as np

# Preconditions:
# out has columns: ['path','full_text_len','page_count','page_char_idx','group','beforeclsfix']

def percent_tables_by_path(out: pd.DataFrame) -> pd.DataFrame:
    # Keep only rows with required fields present
    req = ['path', 'group', 'full_text_len']
    df = out.dropna(subset=req).copy()

    # One row per (path, group): if multiple rows exist, choose the max length per group
    agg = (
        df.groupby(['path', 'group'], as_index=False)
          .agg(full_text_len=('full_text_len', 'max'),
               page_count=('page_count', 'max'),
               beforeclsfix=('beforeclsfix', 'max'))
    )

    # Keep only paths that have exactly two groups
    counts = agg.groupby('path')['group'].nunique()
    valid_paths = counts[counts == 2].index
    agg = agg[agg['path'].isin(valid_paths)]

    # Pivot to wide: two columns named after the actual group labels
    wide = agg.pivot(index='path', columns='group', values='full_text_len')

    # Identify the two group names (per path, but consistent ordering for formulas)
    # We'll sort the column labels alphabetically as (g1, g2) for deterministic output
    cols = list(wide.columns)
    g1, g2 = sorted(cols, key=lambda x: str(x))

    # Pull per-path metadata back (page_count/beforeclsfix) using max across groups
    meta = (agg.groupby('path', as_index=False)
              .agg(page_count=('page_count','max'),
                   beforeclsfix=('beforeclsfix','max')))

    # Compute metrics
    a = wide[g1].astype(float)
    b = wide[g2].astype(float)

    # Percent change from g1 → g2 (standard “(new-old)/old * 100”)
    pct_change_g1_to_g2 = 100.0 * (b - a) / a.replace(0, np.nan)

    # Symmetric percent difference (relative difference wrt mean), robust & signed
    symmetric_pct_diff = 100.0 * (b - a) / ((a + b) / 2.0).replace(0, np.nan)

    # Winner & ratio
    winner = np.where(b > a, g2, np.where(a > b, g1, 'tie'))
    ratio_best_second = np.where(a >= b, a / np.where(b == 0, np.nan, b),
                                 b / np.where(a == 0, np.nan, a))

    # Build tidy table
    result = pd.DataFrame({
        'path': wide.index,
        f'full_text_len[{g1}]': a.values,
        f'full_text_len[{g2}]': b.values,
        f'pct_change_{g1}_to_{g2}': pct_change_g1_to_g2.values,
        'symmetric_pct_diff': symmetric_pct_diff.values,
        'winner_group': winner,
        'ratio_best_second': ratio_best_second,
    })

    # Attach meta and order columns
    result = (result.merge(meta, on='path', how='left')
                    .sort_values(['path'])
                    .reset_index(drop=True))

    return result

# Usage:
table = percent_tables_by_path(out)
# table.head()


ValueError: not enough values to unpack (expected 2, got 0)

In [None]:
table

In [None]:
#docs_no_fillin[0]

# get path doc['path']
# get len of full text: len(doc['text'])
# count pages --> doc['metadata']['page_char_idx'] # len of this is pages (if present)