In [0]:
!pip install docling
!pip install -qU pip docling transformers

In [0]:
from pathlib import Path
from pyspark.dbutils import DBUtils
import re, os

def get_repo_root() -> Path:
    """Return the ``/Workspace/Repos/<user>/<repo>`` prefix of this notebook.

    The notebook path is read from the Databricks notebook context (this
    relies on the global ``spark`` session). If the path does not start with
    ``/Workspace/Repos/<user>/<repo>``, the current working directory is
    returned instead.

    NOTE(review): confirm that ``notebookPath()`` really includes the
    ``/Workspace`` prefix in this runtime; if it does not, the fallback
    branch is always taken.
    """
    context = DBUtils(spark).notebook.entry_point.getDbutils().notebook().getContext()
    current_notebook = context.notebookPath().get()

    # Extract /Workspace/Repos/<user>/<repo>
    repo_prefix = re.match(r"^(\/Workspace\/Repos\/[^\/]+\/[^\/]+)", current_notebook)
    if repo_prefix is None:
        return Path(os.getcwd())
    return Path(repo_prefix.group(1))

# Resolve the repository root and derive every input/output location used by
# the following cells from the directory that contains the repository.
repo_root = get_repo_root()
print("📁 Repo root:", repo_root)

# One level up (like "cd ..")
parent_path = repo_root.parent

# Two levels up (not referenced by the path definitions below)
two_up = repo_root.parent.parent

print("⬆️ Eine Ebene höher:", parent_path)
# parent_path is the base ("root") of all locations defined below.
print("⬆️ root:", parent_path)

doc_root = parent_path / "documents"
out_dir = parent_path / "out"
product_name = "Nano 33 BLE"
product_category = "Nano Family"
file_name  = "Nano_33_BLE_datasheet.pdf"

# Use file_name instead of repeating the literal so the two cannot drift apart.
pdf_path = doc_root / product_category / product_name / file_name
print("⬆️ pdf pfad:",pdf_path)

out_path_docling = out_dir / product_category / product_name / "docling_chunks.jsonl"
print("⬆️ out_path_docling:",out_path_docling)

out_path_langchain = out_dir / product_category / product_name / "langchain_chunks.jsonl"
print("⬆️ out_path_langchain:",out_path_langchain)

code_path_langchain = parent_path / "langchain_chunking.py"
print("⬆️code_path_langchain:",code_path_langchain)

code_path_docling= parent_path / "docling_chunking.py"
# Label corrected: this line prints the Docling script path, not the LangChain one.
print("⬆️code_path_docling:",code_path_docling)


In [0]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
# PDF pipeline settings: OCR is off and no page, picture or table images
# are generated.
pdf_options = PdfPipelineOptions()
pdf_options.do_ocr = False
pdf_options.generate_page_images = False
pdf_options.generate_picture_images = False
pdf_options.generate_table_images = False

# These pipeline options are registered for PDF input only.
format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}

# Build the converter, convert the datasheet and keep the resulting document
# for the following cells.
converter_2 = DocumentConverter(format_options=format_options)
result_2 = converter_2.convert(pdf_path)
doc_2 = result_2.document

In [0]:
# Quick sanity check of the conversion: print the document's `pages`
# attribute and the first 1000 characters of its Markdown export.
print(doc_2.pages)
print(doc_2.export_to_markdown()[:1000])

In [0]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer

from docling.chunking import HybridChunker

# Hugging Face model whose tokenizer is used to count tokens per chunk.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Token budget handed to the tokenizer wrapper.
# NOTE(review): confirm that 800 fits the input window of EMBED_MODEL_ID —
# the value looks larger than what small sentence-transformer models accept.
MAX_TOKENS = 800

tokenizer = HuggingFaceTokenizer(
    max_tokens=MAX_TOKENS,
    tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
)
# merge_peers=True is passed explicitly here.
chunker = HybridChunker(merge_peers=True, tokenizer=tokenizer)

# Materialise the chunk iterator so the chunks can be counted and re-used.
chunk_iter = chunker.chunk(dl_doc=doc_2)
chunks = list(chunk_iter)
chunks

In [0]:
# Text cleaning function
# ---------------------------
# A word split by a hyphen at a line break: "infor-\nmation".
_HYPHEN_BREAK_RE = re.compile(r"(\w)-\n(\w)")
# Runs of two or more spaces/tabs.
_SPACE_RUN_RE = re.compile(r"[ \t]{2,}")
# A caption line such as "Table 3: Pinout", "Figure 2 - Layout" or a bare
# "Table 3:". Applied per line, so it can never reach into the next line.
_CAPTION_RE = re.compile(r"(Table|Figure)\s*\d+[:.\-](\s.*)?", re.IGNORECASE)
# A Markdown table row: "| a | b |".
_TABLE_ROW_RE = re.compile(r"\s*\|.*\|\s*")
# Lines with a larger share of non-alphabetic characters are treated as noise.
_MAX_NON_ALPHA_RATIO = 0.6


def clean_text(t: str) -> str:
    """Normalise extracted PDF text and drop lines that carry no prose.

    Steps, in order:
      1. Remove carriage returns (before de-hyphenation, so words split
         across CRLF line breaks are joined as well).
      2. Join words hyphenated across a line break.
      3. Collapse runs of spaces/tabs into a single space.
      4. Drop every line that is blank, mostly non-alphabetic, a
         Table/Figure caption, or a Markdown table row.

    Filtering is done line by line, so removing a caption never deletes the
    following line and no blank lines are left behind in the result.

    Args:
        t: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with surviving lines joined by ``"\\n"`` and outer
        whitespace stripped; ``""`` for empty input.
    """
    if not t:
        return ""

    t = t.replace("\r", "")
    t = _HYPHEN_BREAK_RE.sub(r"\1\2", t)
    t = _SPACE_RUN_RE.sub(" ", t)

    def noisy(line: str) -> bool:
        """True for blank lines and lines that are mostly non-alphabetic."""
        s = line.strip()
        if not s:
            return True
        non_alpha = sum(1 for ch in s if not ch.isalpha())
        return (non_alpha / max(1, len(s))) > _MAX_NON_ALPHA_RATIO

    def dropped(line: str) -> bool:
        # Horizontal rules ("-----", "=====") need no rule of their own:
        # they are entirely non-alphabetic and therefore already noisy.
        return bool(
            noisy(line)
            or _CAPTION_RE.fullmatch(line)
            or _TABLE_ROW_RE.fullmatch(line)
        )

    kept = [ln for ln in t.split("\n") if not dropped(ln)]
    return "\n".join(kept).strip()


In [0]:

# Cleaned chunks shorter than this many characters are skipped.
MIN_CHUNK_CHARS = 30


def _section_title(chunk):
    """Return the innermost section heading of a chunk, or None.

    Docling chunks carry their heading trail in ``chunk.meta.headings``
    (a list of strings); the last entry is the closest heading. The older
    ``hierarchy_path`` list-of-dicts lookup is kept as a fallback.
    """
    headings = getattr(getattr(chunk, "meta", None), "headings", None)
    if headings:
        return headings[-1]

    hierarchy = getattr(chunk, "hierarchy_path", None)
    if isinstance(hierarchy, list) and hierarchy and isinstance(hierarchy[-1], dict):
        return hierarchy[-1].get("title", None)
    return None


# total_chunks counts every chunk produced by the chunker, including the ones
# skipped below as too short.
total_chunks = len(chunks)

# pdf_path is .../<category>/<product>/<file>.pdf
parts = pdf_path.parts
category = parts[-3]
product = parts[-2]
doc_stem = Path(pdf_path).stem  # loop-invariant prefix of every chunk id

records = []
for i, ch in enumerate(chunks):
    text_raw = clean_text(ch.text or "")
    if len(text_raw) < MIN_CHUNK_CHARS:
        continue

    # Chunk text as returned by the chunker's contextualize(), cleaned the same way.
    context = clean_text(chunker.contextualize(chunk=ch))

    # Determine the section the chunk belongs to (heuristic).
    section = _section_title(ch)

    # Semantic density = tokens / characters (how "compact" the text is).
    n_tokens = tokenizer.count_tokens(context)
    semantic_density = round(n_tokens / max(1, len(context)), 4)

    rec = {
        "category": category,
        # i is the index in the unfiltered chunk list, so ids stay stable
        # even when short chunks are skipped.
        "chunk_id": f"{doc_stem}::c{i}",
        "chunk_size": n_tokens,
        "chunk_type": "contextualized",
        "product": product,
        "section": section,
        "semantic_density": semantic_density,
        "text": context,
        "total_chunks": total_chunks,
    }
    records.append(rec)


In [0]:
# Display the target path of the Docling chunk file.
out_path_docling

In [0]:

# Save the records as JSONL (one JSON object per line)
import json

# open(..., "w") does not create missing directories, so make sure the
# <out>/<category>/<product>/ folder exists before writing.
out_path_docling.parent.mkdir(parents=True, exist_ok=True)

with open(out_path_docling, "w", encoding="utf-8") as f:
    for r in records:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"[OK] {len(records)} Chunks gespeichert unter: {out_path_docling}")




In [0]:
# Read the JSONL file back through Spark (addressed with a "file:" URI) and
# show the main columns as a check of what was written.
df_docling = spark.read.json(f"file:{out_path_docling}")
(
    df_docling
    .select("text", "chunk_id", "chunk_size")
    .display()
)