# PDF Processing Pipeline — Self-contained Notebook (GPU-ready)

This notebook is **fully self-contained**. All pipeline code is embedded inline —
no external `pipeline/` package is needed. It installs every dependency, mounts
Google Drive, finds all PDFs in the folder you specify, and runs the full flow:

**PDF → Markdown → NLP → Chunking → Vector DB → Manifest**


In [None]:
import importlib.util
import subprocess
import sys


def _has(mod: str) -> bool:
    return importlib.util.find_spec(mod) is not None


# (pip distribution name, importable module name) pairs. Both are listed
# because the two names differ for several packages (e.g. scikit-learn/sklearn).
PACKAGES = [
    ("docling", "docling"),
    ("tqdm", "tqdm"),
    ("spacy", "spacy"),
    ("rake-nltk", "rake_nltk"),
    ("vaderSentiment", "vaderSentiment"),
    ("scikit-learn", "sklearn"),
    ("nltk", "nltk"),
    ("qdrant-client", "qdrant_client"),
    ("fastembed", "fastembed"),
    ("chromadb", "chromadb"),
    ("sentence-transformers", "sentence_transformers"),
]

# Install only the distributions whose module cannot currently be found.
missing = [pip for pip, mod in PACKAGES if not _has(mod)]
if missing:
    print(f"Installing: {missing}")
    # Use the running interpreter's pip so packages land in this environment;
    # check_call raises CalledProcessError if pip exits with a failure code.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *missing])
else:
    print("All packages already installed.")

# The spaCy English model is probed like any other module and fetched through
# spaCy's own download command when absent.
if not _has("en_core_web_sm"):
    print("Downloading spaCy model: en_core_web_sm")
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

print("Dependency setup complete.")

In [None]:
import sys
from pathlib import Path

# Mount Google Drive only when the Colab runtime module is already loaded.
if "google.colab" in sys.modules:
    from google.colab import drive

    drive.mount("/content/drive")

# ╔══════════════════════════════════════════════════════════════╗
# ║  USER CONFIGURATION — edit these before running             ║
# ╚══════════════════════════════════════════════════════════════╝
# Folder to process: a name looked up under DRIVE_ROOT, or an absolute path.
FOLDER_NAME = "YOUR_DRIVE_FOLDER_NAME"
BACKEND = "qdrant"  # "qdrant" or "chroma"
# When False the NLP analysis step is skipped and nlp_results stays empty.
RUN_NLP = True

# Root under which FOLDER_NAME is searched.
DRIVE_ROOT = Path("/content/drive/MyDrive")
# Each run writes into a timestamped subfolder of OUTPUT_BASE.
OUTPUT_BASE = Path("/content/output")
# On-disk location of the local Qdrant store (used when BACKEND == "qdrant").
QDRANT_PATH = Path("/content/qdrant_data")

# Fail fast on an unedited placeholder, an unknown backend, or a missing Drive root.
assert FOLDER_NAME and FOLDER_NAME != "YOUR_DRIVE_FOLDER_NAME", "Set FOLDER_NAME first"
assert BACKEND in {"qdrant", "chroma"}, "BACKEND must be 'qdrant' or 'chroma'"
assert DRIVE_ROOT.exists(), f"Drive root not found: {DRIVE_ROOT}"

In [None]:
import json
import logging
import time
import traceback
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional

# Timestamped INFO-level logging shared by every pipeline stage via `log`.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("pipeline")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Model id used for the chunker tokenizer and for both vector-store embedders.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Vector size given to the Qdrant collection; must equal the embedding length
# produced by EMBEDDING_MODEL (presumably 384 for this model — verify on change).
EMBEDDING_DIM = 384
# Default per-chunk token budget passed to the chunker.
MAX_TOKENS = 512
# Default collection name for both the Chroma and the Qdrant backend.
COLLECTION_NAME = "docling_documents"


# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------
@dataclass
class DocRecord:
    """Tracks conversion results and metadata for a single PDF.

    A record starts in the ``"pending"`` state; the conversion step fills in
    the remaining fields and switches ``status`` to ``"success"`` or ``"error"``.
    """

    # File name (no directory); used as the lookup key for NLP results and docs.
    filename: str
    # Full path of the source PDF, stored as a string.
    filepath: str
    # Not populated by the code in this notebook; defaults to empty.
    drive_file_id: str = ""
    # Markdown export of the converted document.
    markdown: str = ""
    num_pages: int = 0
    num_tables: int = 0
    num_figures: int = 0
    # Document name reported by the converter, else the PDF's stem.
    title: str = ""
    # Wall-clock conversion time in seconds, rounded to two decimals.
    conversion_time_s: float = 0.0
    # One of "pending", "success" or "error".
    status: str = "pending"
    # Formatted traceback text when status is "error", otherwise None.
    error: Optional[str] = None


print("Models and constants loaded.")

In [None]:
def is_colab() -> bool:
    """Report whether the ``google.colab`` module is loaded in this process."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules


def ensure_output_dirs(output_dir: Path) -> tuple[Path, Path]:
    """Make sure the ``markdown`` and ``vector_db`` subfolders of *output_dir* exist.

    Missing parents are created and existing folders are left untouched.
    Returns the pair ``(markdown_dir, db_dir)``.
    """
    prepared = []
    for leaf in ("markdown", "vector_db"):
        subfolder = output_dir / leaf
        subfolder.mkdir(parents=True, exist_ok=True)
        prepared.append(subfolder)
    markdown_dir, vector_dir = prepared
    return markdown_dir, vector_dir


def save_manifest(
    output_dir: Path,
    records: "list[DocRecord]",
    nlp_results: dict[str, dict[str, Any]],
) -> Path:
    """Write ``document_manifest.json`` into *output_dir* and return its path.

    The manifest is a JSON list with one entry per record holding the file
    name/path, title, status, page/table/figure counts, conversion time and
    error text. The markdown body is not included. When *nlp_results* has an
    entry for the record's filename it is attached as ``"nlp_analysis"``.

    *output_dir* is created (with parents) if it does not exist yet, so the
    manifest can be written even when no earlier step created the folder.
    """
    manifest: list[dict[str, Any]] = []
    for record in records:
        entry: dict[str, Any] = {
            "filename": record.filename,
            "filepath": record.filepath,
            "title": record.title,
            "status": record.status,
            "num_pages": record.num_pages,
            "num_tables": record.num_tables,
            "num_figures": record.num_figures,
            "conversion_time_s": record.conversion_time_s,
            "error": record.error,
        }
        if record.filename in nlp_results:
            entry["nlp_analysis"] = nlp_results[record.filename]
        manifest.append(entry)

    output_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = output_dir / "document_manifest.json"
    with manifest_path.open("w", encoding="utf-8") as fh:
        # default=str keeps the dump from failing on values json cannot
        # encode natively by writing their string form instead.
        json.dump(manifest, fh, indent=2, ensure_ascii=False, default=str)
    return manifest_path


def discover_pdfs(folder: Path) -> list[Path]:
    """Recursively find all PDF files under *folder*, sorted by full path.

    The extension is matched case-insensitively so ``report.PDF`` is found on
    case-sensitive filesystems too, and only regular files are returned (a
    directory that happens to be named ``something.pdf`` is skipped).
    Returns an empty list when *folder* is missing or is not a directory.
    """
    if not folder.is_dir():
        return []
    return sorted(
        path
        for path in folder.rglob("*")
        if path.suffix.lower() == ".pdf" and path.is_file()
    )


print("Utility functions loaded.")

In [None]:
def create_vlm_converter() -> tuple[Any, str]:
    """Construct a Docling ``DocumentConverter`` that routes PDFs through the VLM pipeline.

    The ``smoldocling`` preset is tried first; if loading it raises, the
    ``granite_docling`` preset is used instead (a failure there propagates).
    Every stage is timed and logged. Returns ``(converter, vlm_name)`` where
    *vlm_name* is ``"SmolDocling"`` or ``"GraniteDocling"``.
    """
    import time as _clock

    started = _clock.time()
    log.info("create_vlm_converter: importing docling modules ...")

    # Imported lazily so the import cost is paid (and timed) here.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmConvertOptions, VlmPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    log.info(f"create_vlm_converter: imports done in {_clock.time() - started:.2f}s")

    preset_started = _clock.time()
    try:
        log.info("create_vlm_converter: trying smoldocling preset ...")
        vlm_options = VlmConvertOptions.from_preset("smoldocling")
        vlm_name = "SmolDocling"
        log.info(
            f"create_vlm_converter: smoldocling loaded in {_clock.time() - preset_started:.2f}s"
        )
    except Exception as exc:
        log.warning(
            f"create_vlm_converter: smoldocling failed ({exc}), trying granite_docling ..."
        )
        vlm_options = VlmConvertOptions.from_preset("granite_docling")
        vlm_name = "GraniteDocling"
        log.info(
            f"create_vlm_converter: granite_docling loaded in {_clock.time() - preset_started:.2f}s"
        )

    options_started = _clock.time()
    pipeline_options = VlmPipelineOptions(vlm_options=vlm_options)
    log.info(
        f"create_vlm_converter: VlmPipelineOptions done in {_clock.time() - options_started:.2f}s"
    )

    build_started = _clock.time()
    pdf_option = PdfFormatOption(
        pipeline_cls=VlmPipeline,
        pipeline_options=pipeline_options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})
    log.info(
        f"create_vlm_converter: DocumentConverter created in {_clock.time() - build_started:.2f}s"
    )
    log.info(
        f"Docling VLM converter initialized ({vlm_name}) — total {_clock.time() - started:.2f}s."
    )
    return converter, vlm_name


def convert_single_pdf(converter: Any, pdf_path: Path) -> tuple[DocRecord, Any]:
    """Run *converter* on one PDF and capture the outcome in a ``DocRecord``.

    Conversion errors are not raised: they are stored on the record
    (``status="error"`` plus the formatted traceback) and logged. The elapsed
    time is always recorded. Returns ``(record, document)``; the document is
    whatever was obtained before a failure, or ``None``.
    """
    size_label = pdf_path.stat().st_size if pdf_path.exists() else "MISSING"
    log.info(f"convert_single_pdf: START — {pdf_path.name} ({size_label} bytes)")

    record = DocRecord(filename=pdf_path.name, filepath=str(pdf_path))
    doc = None
    started = time.time()
    try:
        result = converter.convert(source=str(pdf_path))
        doc = result.document
        record.markdown = doc.export_to_markdown()

        # Counts fall back to 0 when the converter output lacks the attribute.
        page_count = 0
        if hasattr(result, "pages"):
            page_count = len(result.pages)
        record.num_pages = page_count

        table_count = 0
        if hasattr(doc, "tables"):
            table_count = sum(1 for _ in doc.tables)
        record.num_tables = table_count

        figure_count = 0
        if hasattr(doc, "pictures"):
            figure_count = sum(1 for _ in doc.pictures)
        record.num_figures = figure_count

        # Prefer the document's own name; otherwise use the file stem.
        if hasattr(doc, "name") and doc.name:
            record.title = doc.name
        else:
            record.title = pdf_path.stem

        record.status = "success"
        log.info(
            f"convert_single_pdf: SUCCESS — pages={record.num_pages}, "
            f"tables={record.num_tables}, figures={record.num_figures}"
        )
    except Exception:
        record.status = "error"
        record.error = traceback.format_exc()
        log.error(f"convert_single_pdf: ERROR — {record.error}")
    finally:
        elapsed = time.time() - started
        record.conversion_time_s = round(elapsed, 2)
        log.info(
            f"convert_single_pdf: DONE — {pdf_path.name} in {record.conversion_time_s}s"
        )
    return record, doc


def convert_pdfs(
    converter: Any,
    pdf_files: list[Path],
    md_output_dir: Path,
) -> tuple[list[DocRecord], dict[str, Any]]:
    """Batch-convert PDFs. Returns (records, {filename: DoclingDocument}).

    Every input file yields one record, successful or not. For each success
    the markdown is written to *md_output_dir* (created if missing) and the
    converted document is stored under the record's filename.

    PDF discovery is recursive, so two files in different subfolders can share
    a stem. The first keeps ``<stem>.md``; later ones are written as
    ``<stem>_2.md``, ``<stem>_3.md``, ... instead of overwriting it. The
    returned mapping is keyed by bare filename, so a later duplicate filename
    still replaces the earlier document there; a warning is logged when that
    happens.
    """
    records: list[DocRecord] = []
    docling_docs: dict[str, Any] = {}
    used_md_names: set[str] = set()

    md_output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in pdf_files:
        record, doc = convert_single_pdf(converter, pdf_path)
        records.append(record)
        if record.status != "success" or doc is None:
            continue

        # Pick a markdown file name not yet used in this batch.
        md_name = f"{pdf_path.stem}.md"
        attempt = 1
        while md_name in used_md_names:
            attempt += 1
            md_name = f"{pdf_path.stem}_{attempt}.md"
        used_md_names.add(md_name)
        (md_output_dir / md_name).write_text(record.markdown, encoding="utf-8")

        if record.filename in docling_docs:
            log.warning(
                f"Duplicate filename {record.filename}: document from "
                f"{record.filepath} replaces the earlier one in the result map"
            )
        docling_docs[record.filename] = doc

    success = [r for r in records if r.status == "success"]
    failed = [r for r in records if r.status == "error"]
    log.info(f"Conversion: {len(success)} succeeded, {len(failed)} failed")
    return records, docling_docs


print("Conversion functions loaded.")

In [None]:
# Module-level caches for the NLP backends. Each stays None until the matching
# _get_* accessor is first called, which loads the model and stores it here.
_nlp_model = None
_rake_model = None
_vader_model = None


def _get_spacy():
    """Return the cached spaCy ``en_core_web_sm`` pipeline, loading it on first use."""
    global _nlp_model
    if _nlp_model is not None:
        return _nlp_model

    import spacy

    _nlp_model = spacy.load("en_core_web_sm")
    return _nlp_model


def _get_rake():
    """Return the cached ``Rake`` keyword extractor, creating it on first use."""
    global _rake_model
    if _rake_model is not None:
        return _rake_model

    from rake_nltk import Rake

    _rake_model = Rake()
    return _rake_model


def _get_vader():
    """Return the cached VADER ``SentimentIntensityAnalyzer``, creating it on first use."""
    global _vader_model
    if _vader_model is not None:
        return _vader_model

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    _vader_model = SentimentIntensityAnalyzer()
    return _vader_model


def init_nlp() -> tuple:
    """Fetch the NLTK resources and warm up every NLP backend.

    Downloads the ``stopwords`` and ``punkt_tab`` NLTK data quietly, then
    loads (or reuses) the cached models. Returns ``(spacy_nlp, rake, vader)``.
    """
    import nltk

    for resource in ("stopwords", "punkt_tab"):
        nltk.download(resource, quiet=True)

    spacy_nlp = _get_spacy()
    rake = _get_rake()
    vader = _get_vader()
    return spacy_nlp, rake, vader


def extract_keywords_rake(text: str, top_n: int = 15) -> list[str]:
    """Return up to *top_n* RAKE key phrases for *text*, in RAKE's ranked order."""
    extractor = _get_rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases[:top_n]


def extract_entities(text: str, max_chars: int = 100_000) -> dict[str, list[str]]:
    """Collect spaCy named entities from *text*, grouped by entity label.

    Only the first *max_chars* characters are analysed. Within each label the
    stripped entity strings are de-duplicated and sorted, and at most 20 are
    kept.
    """
    parsed = _get_spacy()(text[:max_chars])

    grouped: dict[str, set[str]] = {}
    for ent in parsed.ents:
        bucket = grouped.get(ent.label_)
        if bucket is None:
            bucket = set()
            grouped[ent.label_] = bucket
        bucket.add(ent.text.strip())

    trimmed: dict[str, list[str]] = {}
    for label, names in grouped.items():
        trimmed[label] = sorted(names)[:20]
    return trimmed


def extractive_summary(text: str, num_sentences: int = 5) -> str:
    """Build a quick summary from the leading sentences of *text*.

    The first 50,000 characters are split into sentences with spaCy; sentences
    of 40 characters or fewer (after stripping) are dropped, and the first
    *num_sentences* survivors are joined with single spaces.
    """
    parsed = _get_spacy()(text[:50_000])

    substantial = []
    for sentence in parsed.sents:
        stripped = sentence.text.strip()
        if len(stripped) > 40:
            substantial.append(stripped)

    return " ".join(substantial[:num_sentences])


def analyze_sentiment(text: str, analyzer: Any = None) -> dict[str, Any]:
    """Score the sentiment of *text*.

    With a custom *analyzer* (a callable returning a list whose first item has
    ``"label"`` and ``"score"``), the first 2,000 characters are passed to it
    and its top result is returned with the score rounded to 4 decimals.

    Without one, VADER scores the first 5,000 characters. The compound score
    maps to ``POSITIVE`` (>= 0.05), ``NEGATIVE`` (<= -0.05) or ``NEUTRAL``, and
    the result also carries the absolute and signed compound values plus the
    raw VADER scores.
    """
    if analyzer is None:
        polarity = _get_vader().polarity_scores(text[:5000])
        compound = polarity["compound"]

        label = "NEUTRAL"
        if compound >= 0.05:
            label = "POSITIVE"
        elif compound <= -0.05:
            label = "NEGATIVE"

        return {
            "label": label,
            "score": round(abs(compound), 4),
            "compound": round(compound, 4),
            "scores": polarity,
        }

    prediction = analyzer(text[:2000])
    return {
        "label": prediction[0]["label"],
        "score": round(prediction[0]["score"], 4),
    }


def extract_tfidf_topics(texts: list[str], top_n: int = 10) -> list[list[str]]:
    """Extract top TF-IDF terms per document for topic signals.

    Returns one list per input text holding up to *top_n* terms with a
    non-zero weight, highest weight first. An empty *texts* gives ``[]``.

    Cases that would otherwise abort the whole NLP step yield one empty
    topic list per document instead:

    * A single document. ``max_df=0.85`` then means "fewer than one
      document", which is below ``min_df=1`` and makes scikit-learn raise
      ``ValueError``; the document-frequency cap is therefore dropped when
      there is only one text.
    * Texts with no usable vocabulary (for example only stop words, or
      several identical documents so every term exceeds the cap), which
      scikit-learn also reports as ``ValueError``.
    * A non-positive *top_n*.
    """
    if not texts:
        return []
    if top_n <= 0:
        # A slice like [-0:] would select every term rather than none.
        return [[] for _ in texts]

    from sklearn.feature_extraction.text import TfidfVectorizer

    # The 85% document-frequency cap only makes sense with several documents.
    max_df = 0.85 if len(texts) > 1 else 1.0
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words="english",
        max_df=max_df,
        min_df=1,
    )
    try:
        matrix = vectorizer.fit_transform(texts)
    except ValueError as exc:
        logging.getLogger("pipeline").warning("TF-IDF topics skipped: %s", exc)
        return [[] for _ in texts]

    features = vectorizer.get_feature_names_out()
    topics: list[list[str]] = []
    for i in range(matrix.shape[0]):
        row = matrix[i].toarray().flatten()
        top_idx = row.argsort()[-top_n:][::-1]
        # str() turns the feature-array elements into plain Python strings.
        topics.append([str(features[j]) for j in top_idx if row[j] > 0])
    return topics


def _term_starts_word(haystack: str, term: str) -> bool:
    """Return True if *term* occurs in *haystack* at the start of a word."""
    position = haystack.find(term)
    while position != -1:
        if position == 0 or not haystack[position - 1].isalnum():
            return True
        position = haystack.find(term, position + 1)
    return False


def classify_document_type(text: str) -> str:
    """Heuristic document type classification based on content signals.

    The first 10,000 characters are lower-cased and each document type is
    scored by how many of its signal terms appear. The best-scoring type is
    returned when at least two of its terms are present, otherwise
    ``"general"``. Ties go to the type listed first.

    A term only counts when it begins a word, so "rent" matches "rent" and
    "rental" but not "current", and "lease" does not match "please". Inflected
    forms such as "tenants" or "taxes" still match because only the leading
    boundary is checked.
    """
    text_lower = text[:10_000].lower()
    patterns = {
        "lease": ["lease", "landlord", "tenant", "rent", "premises"],
        "contract": [
            "agreement",
            "parties",
            "hereby",
            "whereas",
            "terms and conditions",
        ],
        "invoice": ["invoice", "amount due", "bill to", "payment terms", "total due"],
        "legal_notice": ["notice", "hereby notified", "pursuant to", "demand"],
        "tax_document": [
            "tax",
            "assessment",
            "property tax",
            "assessed value",
            "taxable",
        ],
        "title_report": ["title", "escrow", "deed", "recording", "conveyance"],
        "amendment": ["amendment", "first amendment", "second amendment", "modify"],
        "letter": ["dear", "sincerely", "regards", "attention"],
        "report": ["report", "findings", "analysis", "recommendation"],
        "insurance": ["insurance", "policy", "premium", "coverage", "claim"],
    }
    scores = {
        doc_type: sum(1 for term in terms if _term_starts_word(text_lower, term))
        for doc_type, terms in patterns.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] >= 2 else "general"


def run_nlp_analysis(
    records: list[DocRecord],
    sentiment_analyzer: Any = None,
) -> dict[str, dict[str, Any]]:
    """Analyse every successfully converted record that has markdown text.

    TF-IDF topics are computed once across all usable documents; keywords,
    entities, summary, sentiment, document type and size counts are computed
    per document. Returns a mapping from filename to that document's analysis.
    """
    usable = [rec for rec in records if rec.status == "success" and rec.markdown]
    topics_per_doc = extract_tfidf_topics([rec.markdown for rec in usable])

    analyses: dict[str, dict[str, Any]] = {}
    for position, rec in enumerate(usable):
        body = rec.markdown
        found_entities = extract_entities(body)
        # Guard against a topic list shorter than the document list.
        doc_topics = topics_per_doc[position] if position < len(topics_per_doc) else []

        analysis: dict[str, Any] = {
            "keywords_rake": extract_keywords_rake(body),
            "named_entities": found_entities,
            "summary": extractive_summary(body),
            "sentiment": analyze_sentiment(body, analyzer=sentiment_analyzer),
            "tfidf_topics": doc_topics,
            "document_type": classify_document_type(body),
            "word_count": len(body.split()),
            "char_count": len(body),
        }
        # Surface the most useful entity groups under friendlier keys.
        for key, label in (
            ("people", "PERSON"),
            ("organizations", "ORG"),
            ("dates", "DATE"),
            ("amounts", "MONEY"),
        ):
            analysis[key] = found_entities.get(label, [])

        analyses[rec.filename] = analysis
    return analyses


print("NLP functions loaded.")

In [None]:
def create_chunker(
    embedding_model: str = EMBEDDING_MODEL,
    max_tokens: int = MAX_TOKENS,
) -> Any:
    """Build a Docling ``HybridChunker`` for the given tokenizer and token budget.

    The chunker receives *embedding_model* as its tokenizer and *max_tokens*
    as its size limit, with ``merge_peers`` enabled.
    """
    from docling.chunking import HybridChunker

    chunker_options = {
        "tokenizer": embedding_model,
        "max_tokens": max_tokens,
        "merge_peers": True,
    }
    return HybridChunker(**chunker_options)


def chunk_documents(
    chunker: Any,
    records: list[DocRecord],
    docling_docs: dict[str, Any],
    nlp_results: dict[str, dict[str, Any]],
) -> list[dict[str, Any]]:
    """Split every successfully converted document into chunk dictionaries.

    Records that failed conversion or have no entry in *docling_docs* are
    skipped. Each chunk dict carries the contextualized and raw text, source
    and document metadata, and document-level NLP fields from *nlp_results*.
    If chunking a document raises, a warning is logged and processing moves on
    to the next document; chunks already collected for it are kept.
    """
    collected: list[dict[str, Any]] = []

    for record in records:
        doc = docling_docs.get(record.filename) if record.status == "success" else None
        if doc is None:
            continue

        analysis = nlp_results.get(record.filename, {})

        try:
            for position, piece in enumerate(chunker.chunk(dl_doc=doc)):
                context_text = chunker.contextualize(piece)
                heading_path = (
                    " > ".join(piece.meta.headings) if piece.meta.headings else ""
                )
                collected.append(
                    {
                        "id": f"{record.filename}::chunk_{position:04d}",
                        "text": context_text,
                        "raw_text": piece.text,
                        "source_file": record.filename,
                        "source_path": record.filepath,
                        "doc_title": record.title,
                        "num_pages": record.num_pages,
                        "num_tables": record.num_tables,
                        "num_figures": record.num_figures,
                        "chunk_index": position,
                        "headings": heading_path,
                        "keywords": ", ".join(analysis.get("keywords_rake", [])[:10]),
                        "tfidf_topics": ", ".join(analysis.get("tfidf_topics", [])[:10]),
                        "sentiment_label": analysis.get("sentiment", {}).get("label", ""),
                        "sentiment_score": analysis.get("sentiment", {}).get("score", 0.0),
                        "doc_summary": analysis.get("summary", "")[:500],
                        "word_count": analysis.get("word_count", 0),
                    }
                )
        except Exception as exc:
            log.warning(f"Chunking failed for {record.filename}: {exc}")

    return collected


print("Chunking functions loaded.")

In [None]:
def create_chroma_collection(
    db_dir: Path,
    collection_name: str = COLLECTION_NAME,
    embedding_model: str = EMBEDDING_MODEL,
) -> tuple[Any, Any]:
    """Open a persistent ChromaDB store at *db_dir* and get or create a collection.

    The collection embeds documents with a SentenceTransformer embedding
    function for *embedding_model* and is configured for cosine distance.
    Returns ``(client, collection)``.
    """
    import chromadb
    from chromadb.utils import embedding_functions

    chroma_client = chromadb.PersistentClient(path=str(db_dir))
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_model,
    )
    collection_options = {
        "name": collection_name,
        "embedding_function": embedder,
        "metadata": {"hnsw:space": "cosine"},
    }
    chroma_collection = chroma_client.get_or_create_collection(**collection_options)
    return chroma_client, chroma_collection


def insert_chunks(
    collection: Any,
    chunks: list[dict[str, Any]],
    batch_size: int = 50,
) -> int:
    """Upsert *chunks* into a ChromaDB collection in batches of *batch_size*.

    Each chunk's ``"id"`` and ``"text"`` become the record id and document;
    a fixed set of the remaining fields is stored as metadata. Returns the
    collection's count after all batches are written.
    """
    metadata_fields = (
        "source_file",
        "source_path",
        "doc_title",
        "num_pages",
        "num_tables",
        "num_figures",
        "chunk_index",
        "headings",
        "keywords",
        "tfidf_topics",
        "sentiment_label",
        "sentiment_score",
        "doc_summary",
        "word_count",
    )

    for offset in range(0, len(chunks), batch_size):
        window = chunks[offset : offset + batch_size]
        batch_ids = [item["id"] for item in window]
        batch_texts = [item["text"] for item in window]
        batch_meta = [
            {field: item[field] for field in metadata_fields} for item in window
        ]
        collection.upsert(ids=batch_ids, documents=batch_texts, metadatas=batch_meta)

    return collection.count()


def build_qdrant_collection(
    qdrant_path: Path,
    chunker: Any,
    records: list,
    docling_docs: dict[str, Any],
    nlp_results: dict[str, dict],
    collection_name: str = COLLECTION_NAME,
    embedding_model_id: str = EMBEDDING_MODEL,
    embedding_dim: int = EMBEDDING_DIM,
) -> int:
    """Chunk documents and insert into Qdrant with FastEmbed embeddings.

    Any existing collection named *collection_name* is deleted and recreated
    with cosine distance and vectors of size *embedding_dim*. Every successful
    record with an entry in *docling_docs* is chunked; each chunk's
    contextualized text is embedded with *embedding_model_id* and stored with
    document and NLP metadata as payload. Point ids are the running chunk
    index across all documents.

    If chunking one document raises, a warning is logged and the remaining
    documents are still processed (same policy as ``chunk_documents``).
    The client is closed on every exit path, including errors.

    Returns the number of points in the collection, or 0 when no chunks were
    produced.
    """
    from fastembed import TextEmbedding
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, PointStruct, VectorParams

    client = QdrantClient(path=str(qdrant_path))
    try:
        if client.collection_exists(collection_name):
            client.delete_collection(collection_name)

        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
        )

        all_documents: list[str] = []
        all_metadata: list[dict] = []

        for record in records:
            if record.status != "success":
                continue
            doc = docling_docs.get(record.filename)
            if doc is None:
                continue
            analysis = nlp_results.get(record.filename, {})

            try:
                for chunk_idx, chunk in enumerate(chunker.chunk(dl_doc=doc)):
                    ctx_text = chunker.contextualize(chunk)
                    metadata = {
                        "text": ctx_text,
                        "source_file": record.filename,
                        "source_path": record.filepath,
                        "doc_title": record.title,
                        "chunk_index": chunk_idx,
                        "headings": (
                            " > ".join(chunk.meta.headings)
                            if chunk.meta.headings
                            else ""
                        ),
                        "keywords": ", ".join(analysis.get("keywords_rake", [])[:10]),
                        "tfidf_topics": ", ".join(
                            analysis.get("tfidf_topics", [])[:10]
                        ),
                        "document_type": analysis.get("document_type", ""),
                        "sentiment_label": analysis.get("sentiment", {}).get(
                            "label", ""
                        ),
                        "sentiment_compound": analysis.get("sentiment", {}).get(
                            "compound", 0.0
                        ),
                        "doc_summary": analysis.get("summary", "")[:500],
                        "people": ", ".join(analysis.get("people", [])[:10]),
                        "organizations": ", ".join(
                            analysis.get("organizations", [])[:10]
                        ),
                        "dates": ", ".join(analysis.get("dates", [])[:10]),
                        "amounts": ", ".join(analysis.get("amounts", [])[:10]),
                        "word_count": analysis.get("word_count", 0),
                    }
                    all_documents.append(ctx_text)
                    all_metadata.append(metadata)
            except Exception as exc:
                # One unchunkable document must not discard the whole batch.
                log.warning(f"Chunking failed for {record.filename}: {exc}")
                continue

        if not all_documents:
            log.warning("No chunks to insert")
            return 0

        log.info(f"Embedding {len(all_documents)} chunks with {embedding_model_id}...")
        embedding_model = TextEmbedding(embedding_model_id)
        embeddings = list(embedding_model.embed(all_documents))

        batch_size = 64
        total = len(all_documents)
        for start in range(0, total, batch_size):
            points = [
                PointStruct(
                    id=idx,
                    vector=embeddings[idx].tolist(),
                    payload=all_metadata[idx],
                )
                for idx in range(start, min(start + batch_size, total))
            ]
            client.upsert(collection_name=collection_name, points=points)

        return client.count(collection_name=collection_name).count
    finally:
        # Always release the on-disk store so a failed run does not leave it
        # held open when the cell is executed again.
        client.close()


print("Vector store functions loaded.")

In [None]:
from datetime import datetime


def resolve_folder_by_name(drive_root: Path, folder_name: str) -> Path:
    """Locate the directory identified by *folder_name*.

    Resolution order:
      1. *folder_name* itself, when it starts with ``/`` and is an existing
         directory;
      2. a direct child of *drive_root* with that name;
      3. any directory with that name anywhere below *drive_root* (the first
         in sorted order; the candidates are printed when there are several).

    Raises ``FileNotFoundError`` when nothing matches.
    """
    if folder_name.startswith("/"):
        absolute = Path(folder_name)
        if absolute.exists() and absolute.is_dir():
            return absolute

    child = drive_root / folder_name
    if child.exists() and child.is_dir():
        return child

    candidates = []
    for entry in drive_root.rglob("*"):
        if entry.is_dir() and entry.name == folder_name:
            candidates.append(entry)
    candidates.sort()

    if not candidates:
        raise FileNotFoundError(
            f"No folder named '{folder_name}' found under {drive_root}"
        )

    if len(candidates) > 1:
        print("Multiple folders found with same name; using first match:")
        # Show at most ten candidates to keep the output short.
        for number, candidate in enumerate(candidates[:10], start=1):
            print(f"  {number}. {candidate}")

    return candidates[0]


# Resolve the input folder and derive a per-run output directory from its name
# plus a timestamp.
target_folder = resolve_folder_by_name(DRIVE_ROOT, FOLDER_NAME)
run_slug = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = OUTPUT_BASE / f"{target_folder.name}_{run_slug}"
md_dir, db_dir = ensure_output_dirs(output_dir)
# Per-document NLP JSON files are written here by a later cell.
nlp_dir = output_dir / "nlp"
nlp_dir.mkdir(parents=True, exist_ok=True)

# Stop early when there is nothing to process.
pdf_files = discover_pdfs(target_folder)
if not pdf_files:
    raise RuntimeError("No PDFs found in selected folder")

print(f"Resolved folder: {target_folder}")
print(f"PDF count:       {len(pdf_files)}")
print(f"Output dir:      {output_dir}")

In [None]:
# Stage 1: convert every discovered PDF; markdown files land in md_dir.
converter, vlm_name = create_vlm_converter()
records, docling_docs = convert_pdfs(
    converter=converter,
    pdf_files=pdf_files,
    md_output_dir=md_dir,
)

# `success` and `failed` are reused by the summary and failure-report cells.
success = [r for r in records if r.status == "success"]
failed = [r for r in records if r.status == "error"]
print(f"VLM preset:  {vlm_name}")
print(f"Conversion:  {len(success)} success, {len(failed)} failed")

# Stage 2 (optional): NLP analysis. nlp_results stays empty when RUN_NLP is
# False, which the later stages tolerate.
nlp_results: dict[str, dict] = {}
if RUN_NLP:
    _ = init_nlp()
    nlp_results = run_nlp_analysis(records)

    # One JSON file per analysed document, named after the PDF's stem.
    for filename, analysis in nlp_results.items():
        nlp_file = nlp_dir / f"{Path(filename).stem}_nlp.json"
        with open(nlp_file, "w", encoding="utf-8") as handle:
            json.dump(analysis, handle, indent=2, default=str)

print(f"NLP analysis: {len(nlp_results)} document(s)")

In [None]:
# Stage 3: chunk the converted documents and load them into the chosen store.
chunker = create_chunker()

if BACKEND == "qdrant":
    # The Qdrant helper chunks, embeds and inserts in a single call.
    vector_count = build_qdrant_collection(
        qdrant_path=QDRANT_PATH,
        chunker=chunker,
        records=records,
        docling_docs=docling_docs,
        nlp_results=nlp_results,
    )
else:
    # Chroma path: chunk first, then upsert into a persistent collection in db_dir.
    chunks = chunk_documents(chunker, records, docling_docs, nlp_results)
    _, collection = create_chroma_collection(db_dir)
    vector_count = insert_chunks(collection, chunks)

# Stage 4: write the manifest covering every record, successful or failed.
manifest_path = save_manifest(output_dir, records, nlp_results)

# Sum of per-file conversion times (failed conversions included).
total_time = sum(r.conversion_time_s for r in records)
print("=" * 60)
print("PIPELINE COMPLETE")
print(f"  PDFs processed:  {len(records)}")
print(f"  Succeeded:       {len(success)}")
print(f"  Failed:          {len(failed)}")
print(f"  Vectors stored:  {vector_count}")
print(f"  Markdown dir:    {md_dir}")
print(f"  Manifest:        {manifest_path}")
print(f"  Total conv time: {total_time:.1f}s")

In [None]:
# Failure report: list each failed file with the start of its traceback.
if not failed:
    print("All files processed successfully.")
else:
    print("\nFailed files:")
    for record in failed:
        # Cap the error text at 1000 characters to keep the output readable.
        error_excerpt = (record.error or "unknown error")[:1000]
        print(f"- {record.filename}")
        print(error_excerpt)
        print("-" * 80)