# Process JSONL Claims to .txt for as training corpus

In [None]:
INPUT_FOLDER = "../data"
OUTPUT_FILE  = "../claim1.txt"
REFS_MODE    = "remove"        # "keep" | "remove" | "replace" with <IDX>
ADD_EOS      = True             # add <eos> token to end of each line
ALL_CLAIMS   = False            # True = all claims, False = only claim 1
NFKC         = True             # normalize to NFKC form 
DEDUPE       = True             # deduplicate lines
MIN_LEN      = 20               # 0 = no cap 
MAX_LEN      = 4000             # 0 = no cap

# Charset
MIN_PRINTABLE_RATIO = 0.98      # drop lines with < this fraction of printable chars
MAX_NONASCII_RATIO  = 0.50      # drop lines with > this fraction of non-ASCII (0 disables)

In [2]:
import re, json, unicodedata, string
from pathlib import Path
from tqdm.auto import tqdm

# --- Patterns ---
REF_PARENS = r"""[\(\[\{]\s*(?:\d+[A-Za-z]*[′'″]*)(?:\s*,\s*\d+[A-Za-z]*[′'″]*)*\s*[\)\]\}]"""
REF_REGEX  = re.compile(REF_PARENS)
WS         = re.compile(r"\s+")

PRINTABLE_SET = set(string.printable) | {"’","“","”","–","—","·","•","°","µ","²","³","±"}  # allow a few common unicode

def is_line_charset_ok(s: str) -> bool:
    if not s:
        return False
    # Count printable (allow whitespace) using Python's notion + our extra set
    total = len(s)
    printable = sum((ch in PRINTABLE_SET) or ch.isprintable() for ch in s)
    if printable / total < MIN_PRINTABLE_RATIO:
        return False
    if MAX_NONASCII_RATIO:
        nonascii = sum(ord(ch) > 127 for ch in s)
        if (nonascii / total) > MAX_NONASCII_RATIO:
            return False
    return True

def process_ref_numerals(text: str, mode: str) -> str:
    if mode == "keep":   return text
    if mode == "remove": return REF_REGEX.sub(" ", text)
    if mode == "replace":
        out = REF_REGEX.sub(" <IDX> ", text)
        return re.sub(r"(?:\s*<IDX>\s*){2,}", " <IDX> ", out)
    raise ValueError("REFS_MODE must be 'keep', 'remove', or 'replace'")

def clean_claim(text: str, refs_mode: str, add_eos: bool, nfkc: bool) -> str:
    t = text.strip().replace("\n", " ")
    if nfkc:
        t = unicodedata.normalize("NFKC", t)
    t = process_ref_numerals(t, refs_mode)
    t = WS.sub(" ", t).strip()
    if add_eos and not t.endswith("<EOS>"):
        t = f"{t} <EOS>"
    return t

def process_jsonl_dir(
    in_dir: str,
    out_path: str,
    refs_mode: str = "replace",
    add_eos: bool = True,
    all_claims: bool = False,
    nfkc: bool = True,
    dedupe: bool = True,
    min_len: int = 20,
    max_len: int = 8000,
):
    in_dir = Path(in_dir)
    files = sorted(in_dir.glob("*.jsonl"))
    seen = set()
    written = 0

    with open(out_path, "w", encoding="utf-8") as out_f, tqdm(total=len(files), desc="Files", unit="file") as pbar:
        for fp in files:
            try:
                with open(fp, "r", encoding="utf-8", errors="ignore") as fh:
                    for line in fh:
                        try:
                            data = json.loads(line)
                        except json.JSONDecodeError:
                            continue

                        claims = data.get("c", {})
                        iter_claims = claims.values() if isinstance(claims, dict) else (claims if isinstance(claims, list) else [])
                        for i, claim_text in enumerate(iter_claims):
                            if not isinstance(claim_text, str):
                                continue
                            t = clean_claim(claim_text, refs_mode, add_eos, nfkc)

                            # Length + charset guards
                            if len(t) < min_len or (max_len and len(t) > max_len):
                                if not all_claims and i == 0: break
                                else: continue
                            if not is_line_charset_ok(t):
                                if not all_claims and i == 0: break
                                else: continue

                            if dedupe:
                                if t in seen:
                                    if not all_claims and i == 0: break
                                    else: continue
                                seen.add(t)

                            out_f.write(t + "\n")
                            written += 1
                            if not all_claims:
                                break  # only first claim per record
            finally:
                pbar.update(1)
    return written, len(files)


In [3]:
# RUN
count, nfiles = process_jsonl_dir(
    INPUT_FOLDER, OUTPUT_FILE,
    refs_mode=REFS_MODE,
    add_eos=ADD_EOS,
    all_claims=ALL_CLAIMS,
    nfkc=NFKC,
    dedupe=DEDUPE,
    min_len=MIN_LEN,
    max_len=MAX_LEN,
)
print(f"Processed {nfiles} files; wrote {count} claims → {OUTPUT_FILE}")


Files:   0%|          | 0/314 [00:00<?, ?file/s]

Processed 314 files; wrote 558619 claims → ../claim1.txt


In [6]:
N = 10  # Number of lines to print
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= N:
            break
        print(line.strip())

An X-ray imaging system , comprising: - a moveable X-ray source and/or a moveable X-ray detector ; - a plurality of object-surface detecting sensors ; - a positioning detection arrangement ; - a processing unit ; and - a display unit ; wherein the object-surface detecting sensors are arranged such to detect object data of objects located between the X-ray source and the X-ray detector; wherein the positioning detection arrangement is provided to detect the current position of the X-ray source and/or the X-ray detector and the position of the object-surface detecting sensors; wherein the processing unit is configured to compute a situation-map of the current spatial situation between the X-ray source and the X-ray detector based on the object data provided by the object-surface detecting sensors and the current position, the situation-map distinguishing at least between empty spaces and spaces occupied by rigid objects; and wherein the situation-map comprises a representation of the X-r