# Studium Parisiense — Schema Targeting + LLM‑Ready Dataset Builder

This notebook does two things:

1. **Schema targeting (data‑driven):** scans the full Studium JSONL dataset and enumerates:
   - all sections and subfields present
   - meta-entity signals (`places`, `institutions`, `names`, `titles`, `dates`, `id`)
   - an exhaustive predicate inventory (e.g., `origin.birthPlace`, `professionalCareer.royalAdministration`, …)

2. **LLM‑ready dataset generation (one record per person by default):**
   - renders each profile into a clean, consistent **text document**
   - cleans common markup (`$...$`, `&...&`, `%...%`, `word=word`)
   - keeps **structured metadata** (unique places/institutions/names/titles/dates) alongside the text for later linking / evaluation
   - optionally **chunks very long profiles** (toggle), but the default is *one record per person*.

> **Input:** a JSON Lines file (`.jsonl`) where each line is one profile record.  
> **Output:** a JSON Lines file ready to feed into an LLM pipeline.


In [None]:
from __future__ import annotations

import json
import os
import re
from collections import Counter, defaultdict
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pandas as pd

# =========================
# Configuration
# =========================
INPUT_JSONL = "studium_parisiense_dataset.jsonl" 
BASE_URL = "http://studium-parisiense.univ-paris1.fr"

OUTPUT_DIR = "llm_ready_outputs2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# What to include in the generated text
INCLUDE_BIBLIOGRAPHY = True
INCLUDE_RAW = False  # raw is very verbose; 
INCLUDE_REFERENCES = True   # include 'reference' lines inside each section item
INCLUDE_COMMENTS = True     # include 'comment' lines inside each section item

# Text size policy
ONE_RECORD_PER_PERSON = True 
CHUNK_LONG_RECORDS = False    # if True, long records are split into chunks now
MAX_CHARS_PER_CHUNK = 12000   # used only if CHUNK_LONG_RECORDS=True

# If ONE_RECORD_PER_PERSON=True and CHUNK_LONG_RECORDS=False:
# we still flag very long profiles for future chunking.
WARN_IF_TEXT_GT = 20000

# Output files
OUT_LLM_READY_JSONL = os.path.join(OUTPUT_DIR, "studium_llm_ready_people.jsonl")
OUT_SCHEMA_PREDICATES_CSV = os.path.join(OUTPUT_DIR, "schema_predicates_inventory.csv")
OUT_SECTIONS_CSV = os.path.join(OUTPUT_DIR, "schema_sections_coverage.csv")
OUT_META_KEYS_CSV = os.path.join(OUTPUT_DIR, "schema_meta_keys.csv")
OUT_LONG_RECORDS_CSV = os.path.join(OUTPUT_DIR, "very_long_profiles_to_chunk_later.csv")

print("Config OK")


Config OK


In [2]:
# =========================
# Helpers
# =========================

def read_jsonl(path: str) -> Iterable[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_no}: {e}") from e


def ensure_list(x: Any) -> List[Any]:
    if x is None:
        return []
    if isinstance(x, list):
        return x
    return [x]


def normalize_markup(text: str) -> str:
    """Best-effort cleanup of Studium inline markup."""
    if text is None:
        return ""
    s = str(text)

    # Replace '=' between word characters with a space: "Città=del=Vaticano" -> "Città del Vaticano"
    s = re.sub(r"(?<=\w)=(?=\w)", " ", s)

    # Remove markup delimiters (keep content)
    s = s.replace("$", "")
    s = s.replace("&", "")
    s = s.replace("%", "")

    # Common stray pattern after removing % (e.g., ':1485' used as '%:1485%')
    s = re.sub(r"\b:\s*(\d{3,4})\b", r"\1", s)

    # Normalize whitespace
    s = s.replace("\t", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s


def safe_get_value(item: Any) -> str:
    if item is None:
        return ""
    if isinstance(item, dict):
        return normalize_markup(item.get("value", ""))
    return normalize_markup(str(item))


def iter_fact_items(section_value: Any) -> Iterable[Tuple[str, Dict[str, Any]]]:
    if not isinstance(section_value, dict):
        return
    for subfield, arr in section_value.items():
        for item in ensure_list(arr):
            if isinstance(item, dict):
                yield subfield, item
            else:
                yield subfield, {"value": str(item), "meta": {}}


def extract_meta_entities_from_item(item: Dict[str, Any]) -> Dict[str, List[Any]]:
    meta = item.get("meta")
    if not isinstance(meta, dict):
        return {}
    return {k: ensure_list(v) for k, v in meta.items()}


def merge_entity_sets(dst: Dict[str, set], src: Dict[str, List[Any]]) -> None:
    for k, vals in src.items():
        if k not in dst:
            dst[k] = set()
        for v in vals:
            if isinstance(v, dict):
                dst[k].add(json.dumps(v, ensure_ascii=False, sort_keys=True))
            else:
                dst[k].add(str(v))


def canonical_person_name(rec: Dict[str, Any]) -> str:
    ident = rec.get("identity", {})
    if isinstance(ident, dict) and "name" in ident:
        name_items = ensure_list(ident.get("name"))
        if name_items:
            v = safe_get_value(name_items[0])
            if v:
                return v
    title = rec.get("title")
    return normalize_markup(title) if title else ""


def absolute_link(link: str) -> str:
    if not link:
        return ""
    if link.startswith("http://") or link.startswith("https://"):
        return link
    return BASE_URL.rstrip("/") + "/" + link.lstrip("/")


def get_activity_mediane(rec: Dict[str, Any]) -> Optional[int]:
    # sometimes in extras, sometimes in identity
    extras = rec.get("extras")
    if isinstance(extras, dict) and isinstance(extras.get("activityMediane"), int):
        return extras["activityMediane"]
    ident = rec.get("identity")
    if isinstance(ident, dict) and isinstance(ident.get("activityMediane"), list):
        # identity.activityMediane usually appears as list of items with 'value' or a raw int
        for it in ident["activityMediane"]:
            if isinstance(it, dict):
                v = it.get("value")
                if isinstance(v, int):
                    return v
                if isinstance(v, str) and v.strip().isdigit():
                    return int(v.strip())
            elif isinstance(it, int):
                return it
    return None


print("Helpers loaded")


Helpers loaded


In [3]:
# =========================
# 1) Schema targeting (data-driven)
# =========================

TOP_LEVEL_IGNORE = {"_id", "reference", "title", "link", "raw"}  # we keep 'extras' because it contains activityMediane

section_counts = Counter()
predicate_counts = Counter()
meta_key_counts = Counter()
predicate_meta_counts: Dict[str, Counter] = defaultdict(Counter)

extras_key_counts = Counter()

n_records = 0

for rec in read_jsonl(INPUT_JSONL):
    n_records += 1

    # extras keys
    extras = rec.get("extras")
    if isinstance(extras, dict):
        extras_key_counts.update(extras.keys())

    # Which top-level sections exist?
    for k in rec.keys():
        if k in TOP_LEVEL_IGNORE or k == "extras":
            continue
        section_counts[k] += 1

    # Predicates: section.subfield
    for section, sec_val in rec.items():
        if section in TOP_LEVEL_IGNORE or section == "extras":
            continue
        if not isinstance(sec_val, dict):
            continue

        for subfield, item in iter_fact_items(sec_val):
            pred = f"{section}.{subfield}"
            predicate_counts[pred] += 1

            meta = item.get("meta")
            if isinstance(meta, dict):
                for mk in meta.keys():
                    meta_key_counts[mk] += 1
                    predicate_meta_counts[pred][mk] += 1

print(f"Records scanned: {n_records:,}")
print("Extras keys:", dict(extras_key_counts))

# Save schema tables
df_sections = pd.DataFrame(section_counts.most_common(), columns=["section", "records_with_section"])
df_sections["coverage_pct"] = (df_sections["records_with_section"] / n_records * 100).round(2)
df_sections.to_csv(OUT_SECTIONS_CSV, index=False)

rows = []
for pred, cnt in predicate_counts.most_common():
    meta_keys = dict(predicate_meta_counts[pred].most_common())
    rows.append({
        "predicate": pred,
        "records_with_predicate": cnt,
        "meta_keys_observed": json.dumps(meta_keys, ensure_ascii=False),
    })
df_pred = pd.DataFrame(rows)
df_pred.to_csv(OUT_SCHEMA_PREDICATES_CSV, index=False)

df_meta = pd.DataFrame(meta_key_counts.most_common(), columns=["meta_key", "count"])
df_meta.to_csv(OUT_META_KEYS_CSV, index=False)

display(df_sections.head(20))
display(df_meta)


Records scanned: 20,149
Extras keys: {'activityMediane': 20045}


Unnamed: 0,section,records_with_section,coverage_pct
0,identity,20149,100.0
1,bibliography,20127,99.89
2,curriculum,19614,97.34
3,origin,11193,55.55
4,ecclesiasticalCareer,10924,54.22
5,professionalCareer,6740,33.45
6,relationalInsertion,6257,31.05
7,textualProduction,1562,7.75
8,assets,1361,6.75
9,politicalCareer,1108,5.5


Unnamed: 0,meta_key,count
0,dates,115631
1,places,63626
2,institutions,62969
3,names,51789
4,titles,25621
5,isComment,20153
6,isLink,20149
7,cotes,52


## Target schema


In [4]:
# =========================
# 2) Render each person into a LLM-ready text
# =========================

SECTION_ORDER = [
    "identity",
    "origin",
    "curriculum",
    "ecclesiasticalCareer",
    "professionalCareer",
    "relationalInsertion",
    "textualProduction",
    "assets",
    "politicalCareer",
    "travels",
    "commissions",
    "orality",
    "distinctiveSign",
    "otherActivities",
    "bibliography",
]

SECTION_TITLES = {
    "identity": "IDENTITY",
    "origin": "ORIGIN",
    "curriculum": "CURRICULUM",
    "ecclesiasticalCareer": "ECCLESIASTICAL CAREER",
    "professionalCareer": "PROFESSIONAL CAREER",
    "relationalInsertion": "RELATIONAL INSERTION",
    "textualProduction": "TEXTUAL PRODUCTION",
    "assets": "ASSETS",
    "politicalCareer": "POLITICAL CAREER",
    "travels": "TRAVELS",
    "commissions": "COMMISSIONS",
    "orality": "ORALITY",
    "distinctiveSign": "DISTINCTIVE SIGN",
    "otherActivities": "OTHER ACTIVITIES",
    "bibliography": "BIBLIOGRAPHY",
}


def render_item_lines(item: Dict[str, Any]) -> List[str]:
    lines: List[str] = []
    v = safe_get_value(item)
    if v:
        lines.append(f"- {v}")

    if INCLUDE_COMMENTS:
        for c in ensure_list(item.get("comment")):
            cv = normalize_markup(str(c))
            if cv:
                lines.append(f"  • comment: {cv}")

    if INCLUDE_REFERENCES:
        for r in ensure_list(item.get("reference")):
            rv = normalize_markup(str(r))
            if rv:
                lines.append(f"  • source: {rv}")

    return lines


def render_generic_section(section_value: Dict[str, Any]) -> List[str]:
    out: List[str] = []
    for subfield in sorted(section_value.keys()):
        items = ensure_list(section_value.get(subfield))
        if not items:
            continue
        out.append(f"{subfield}:")
        for it in items:
            if isinstance(it, dict):
                out.extend(render_item_lines(it))
            else:
                out.append(f"- {normalize_markup(str(it))}")
    return out


def render_textual_production(tp: Dict[str, Any]) -> List[str]:
    out: List[str] = []
    for domain in sorted(tp.keys()):
        domval = tp[domain]
        if not isinstance(domval, dict):
            continue

        header = normalize_markup(domval.get("value", domain))
        out.append(f"domain: {domain}")
        if header:
            out.append(f"- {header}")

        opus_list = ensure_list(domval.get("opus"))
        if opus_list:
            out.append("  opus:")
        for opus in opus_list:
            if not isinstance(opus, dict):
                continue
            main_title = normalize_markup(str(opus.get("mainTitle") or ""))
            if main_title:
                out.append(f"  - work: {main_title}")

            for k in sorted(opus.keys()):
                if k == "mainTitle":
                    continue
                vals = ensure_list(opus.get(k))
                if not vals:
                    continue
                for x in vals:
                    if isinstance(x, dict):
                        xv = safe_get_value(x)
                        if xv:
                            out.append(f"    • {k}: {xv}")
                    else:
                        out.append(f"    • {k}: {normalize_markup(str(x))}")
    return out


def build_llm_text_and_meta(rec: Dict[str, Any]) -> Tuple[str, Dict[str, List[str]]]:
    ref = str(rec.get("reference", "")).strip()
    name = canonical_person_name(rec)
    title = normalize_markup(str(rec.get("title", ""))) if rec.get("title") else ""
    link = absolute_link(rec.get("link", ""))

    activity_mediane = get_activity_mediane(rec)

    meta_entities: Dict[str, set] = {}
    lines: List[str] = []

    # Header (stable)
    lines.append(f"reference: {ref}")
    if name:
        lines.append(f"name: {name}")
    if title and title != name:
        lines.append(f"title: {title}")
    if link:
        lines.append(f"link: {link}")
    if activity_mediane is not None:
        lines.append(f"activityMediane: {activity_mediane}")
    lines.append("")

    for section in SECTION_ORDER:
        if section == "bibliography" and not INCLUDE_BIBLIOGRAPHY:
            continue

        sec_val = rec.get(section)
        if sec_val is None or not isinstance(sec_val, dict):
            continue

        if section == "textualProduction":
            section_lines = render_textual_production(sec_val)
        else:
            section_lines = render_generic_section(sec_val)

        if not section_lines:
            continue

        lines.append(f"[{SECTION_TITLES.get(section, section.upper())}]")
        lines.extend(section_lines)
        lines.append("")

        # meta extraction from section fact items
        for _, item in iter_fact_items(sec_val):
            merge_entity_sets(meta_entities, extract_meta_entities_from_item(item))

        # meta extraction inside textualProduction opus
        if section == "textualProduction":
            for _, domval in sec_val.items():
                if isinstance(domval, dict):
                    for opus in ensure_list(domval.get("opus")):
                        if isinstance(opus, dict):
                            for vv in opus.values():
                                for it in ensure_list(vv):
                                    if isinstance(it, dict):
                                        merge_entity_sets(meta_entities, extract_meta_entities_from_item(it))

    meta_serializable: Dict[str, List[str]] = {k: sorted(list(v)) for k, v in meta_entities.items()}
    text = "\n".join(lines).strip()
    return text, meta_serializable


print("Renderer ready")


Renderer ready


In [5]:
# =========================
# 3) Export LLM-ready JSONL
# =========================

def chunk_text(text: str, max_chars: int) -> List[str]:
    if len(text) <= max_chars:
        return [text]
    parts = text.split("\n\n")
    chunks: List[str] = []
    cur = ""
    for p in parts:
        if not cur:
            cur = p
        elif len(cur) + 2 + len(p) <= max_chars:
            cur += "\n\n" + p
        else:
            chunks.append(cur)
            cur = p
    if cur:
        chunks.append(cur)

    final: List[str] = []
    for c in chunks:
        if len(c) <= max_chars:
            final.append(c)
        else:
            for i in range(0, len(c), max_chars):
                final.append(c[i:i+max_chars])
    return final


n_written = 0
long_rows = []

with open(OUT_LLM_READY_JSONL, "w", encoding="utf-8") as out:
    for rec in read_jsonl(INPUT_JSONL):
        ref = str(rec.get("reference", "")).strip()
        name = canonical_person_name(rec)
        link = absolute_link(rec.get("link", ""))
        text, meta_entities = build_llm_text_and_meta(rec)

        if ONE_RECORD_PER_PERSON and not CHUNK_LONG_RECORDS:
            row = {
                "reference": ref,
                "name": name,
                "link": link,
                "text": text,
                "meta_entities": meta_entities,
                "text_len": len(text),
            }
            out.write(json.dumps(row, ensure_ascii=False) + "\n")
            n_written += 1

            if len(text) > WARN_IF_TEXT_GT:
                long_rows.append({
                    "reference": ref,
                    "name": name,
                    "link": link,
                    "text_len": len(text),
                })

        else:
            chunks = chunk_text(text, MAX_CHARS_PER_CHUNK)
            for idx, ch in enumerate(chunks):
                row = {
                    "reference": ref,
                    "name": name,
                    "link": link,
                    "chunk_id": f"{ref}::chunk{idx:03d}",
                    "chunk_index": idx,
                    "n_chunks": len(chunks),
                    "text": ch,
                    "meta_entities": meta_entities,
                    "text_len": len(ch),
                }
                out.write(json.dumps(row, ensure_ascii=False) + "\n")
                n_written += 1

print(f"Wrote {n_written:,} rows to: {OUT_LLM_READY_JSONL}")

if long_rows:
    df_long = pd.DataFrame(long_rows).sort_values("text_len", ascending=False)
    df_long.to_csv(OUT_LONG_RECORDS_CSV, index=False)
    print(f"Flagged {len(long_rows):,} very long profiles for future chunking: {OUT_LONG_RECORDS_CSV}")
    display(df_long.head(20))
else:
    print("No profiles exceeded WARN_IF_TEXT_GT.")


Wrote 20,149 rows to: llm_ready_outputs2\studium_llm_ready_people.jsonl
Flagged 132 very long profiles for future chunking: llm_ready_outputs2\very_long_profiles_to_chunk_later.csv


Unnamed: 0,reference,name,link,text_len
126,12000,THOMAS de Aquino,http://studium-parisiense.univ-paris1.fr/indiv...,921477
10,50875,AEGIDIUS Romanus,http://studium-parisiense.univ-paris1.fr/indiv...,344970
77,54530,HUGO de Sancto Victore,http://studium-parisiense.univ-paris1.fr/indiv...,266786
17,1533,BONAVENTURA de Bagnoregio,http://studium-parisiense.univ-paris1.fr/indiv...,213028
3,385,ALBERTUS Magnus,http://studium-parisiense.univ-paris1.fr/indiv...,204686
111,58754,ROBERTUS Grosseteste,http://studium-parisiense.univ-paris1.fr/indiv...,174624
92,51944,NICOLAUS de Lyra,http://studium-parisiense.univ-paris1.fr/indiv...,156589
131,52376,VINCENTIUS Belvacensis,http://studium-parisiense.univ-paris1.fr/indiv...,138567
1,16804,ARNALDUS de Villanova,http://studium-parisiense.univ-paris1.fr/indiv...,137887
100,56805,PETRUS Lombardus,http://studium-parisiense.univ-paris1.fr/indiv...,131912


In [6]:
# =========================
# 4) Quick validation / QA
# =========================

refs = set()
lens = []

with open(OUT_LLM_READY_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        refs.add(row["reference"])
        lens.append(row["text_len"])

s = pd.Series(lens)
print("Unique references in output:", len(refs))
print("Text length percentiles:", {p: int(s.quantile(p/100)) for p in [50, 75, 90, 95, 99]})
print("Max text_len:", int(s.max()))


Unique references in output: 20149
Text length percentiles: {50: 1236, 75: 1722, 90: 2905, 95: 4476, 99: 13024}
Max text_len: 921477
