## 1) Setup + configuración

In [None]:
from dotenv import load_dotenv
import os
from pathlib import Path


# Populate the process environment via python-dotenv before reading the key
# (the assertion message below indicates the key is expected in a .env file).
load_dotenv()

# Stop early if the API key is absent or empty.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY, "Missing OPENAI_API_KEY at .env"

# Input PDFs are resolved against this folder; the path is relative to the
# current working directory.
DATA_DIR = Path("data")
assert DATA_DIR.exists(), "data/ folder does not exist!"

print("✅ Environment OK")

In [None]:
# Locations of the two source PDFs inside DATA_DIR.
BOE_PDF = DATA_DIR / "BOE-A-2020-8608.pdf"
SENTINEL_PDF = DATA_DIR / "sentinel_secure_services_maual_operativo.pdf"  # noqa

# Only report availability here; nothing is raised when a file is missing.
print("BOE exists:", BOE_PDF.exists())
print("Sentinel exists:", SENTINEL_PDF.exists())

## 2) Carga + extracción

In [None]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader


def load_pdf(path: Path) -> list[Document]:
    """Load the PDF at ``path`` with ``PyPDFLoader`` and return its documents.

    Args:
        path: Location of the PDF file; it is handed to the loader as a string.

    Returns:
        The list produced by ``PyPDFLoader.load()`` for that file.
    """
    return PyPDFLoader(str(path)).load()


# Load each PDF only when it is present; a missing file yields an empty list
# instead of an error.
docs_boe = load_pdf(BOE_PDF) if BOE_PDF.exists() else []
docs_sentinel = load_pdf(SENTINEL_PDF) if SENTINEL_PDF.exists() else []

# `docs` is the working set: here it is bound to the BOE pages only.
docs = docs_boe
print(f"Pages loaded: {len(docs)}")
# Both previews fall back to a readable message when nothing was loaded
# (the metadata line used to print the literal "None").
print(f"Example metadata: {docs[0].metadata}" if docs else "No metadata")
print(f"Sample text: {docs[0].page_content[:400]} [...]" if docs else "No docs")

In [None]:
def doc_stats(documents: list[Document]) -> tuple[int, float]:
    """Return the total and the per-document average character count.

    Args:
        documents: Documents whose ``page_content`` lengths are measured.

    Returns:
        ``(total_characters, average_characters)``. The total is an int; the
        average comes from true division and is therefore a float. For an
        empty list the result is ``(0, 0.0)``.
    """
    total_characters = sum(len(d.page_content) for d in documents)
    # Clamp the divisor to 1 so an empty input does not raise ZeroDivisionError.
    average_characters = total_characters / max(1, len(documents))
    return total_characters, average_characters


# Size summary of the loaded pages; both numbers are formatted with "_" as the
# thousands separator and the average is rounded to zero decimals.
total_chars, avg_chars = doc_stats(docs)
print(f"Total characters: {total_chars:_}\nAvg characters per page: {avg_chars:_.0f}")

## 3) Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_docs(documents: list[Document], chunk_size: int = 800, chunk_overlap: int = 120) -> list[Document]:
    """Split ``documents`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The documents returned by the splitter's ``split_documents``.
    """
    # Separator list handed to the splitter, ordered from paragraph breaks
    # down to the empty string.
    separator_priority = ["\n\n", "\n", ". ", " ", ""]
    return RecursiveCharacterTextSplitter(
        separators=separator_priority,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)


# Split the same pages with three chunk_size / chunk_overlap settings so their
# effect can be compared in the cells that follow.
chunks_200 = split_docs(docs, chunk_size=200, chunk_overlap=50)
chunks_800 = split_docs(docs, chunk_size=800, chunk_overlap=120)
chunks_2000 = split_docs(docs, chunk_size=2000, chunk_overlap=200)

# Number of chunks produced by each setting.
print(f"Chunks (200): {len(chunks_200)}")
print(f"Chunks (800): {len(chunks_800)}")
print(f"Chunks (2000): {len(chunks_2000)}")

In [None]:
import textwrap


def preview_chunks(chunks: list[Document], n: int = 2, width: int = 100, max_chars: int = 800) -> None:
    """Print a wrapped, single-paragraph preview of the first ``n`` chunks.

    Args:
        chunks: Chunks to preview.
        n: Maximum number of chunks to show; fewer are shown if ``chunks`` is shorter.
        width: Line width used when wrapping the preview text.
        max_chars: Number of characters of the flattened text that are shown.

    Each chunk gets a header with its index, the ``source`` and ``page``
    metadata entries and the length of its untouched ``page_content``.
    """
    # A non-positive n previews nothing.
    for index, chunk in enumerate(chunks[: max(n, 0)]):
        metadata = chunk.metadata
        raw_text = chunk.page_content
        # Trim the ends and collapse newlines so the text wraps as one paragraph.
        flattened = raw_text.strip().replace("\n", " ")
        print(f"\n—Chunk {index} | source={metadata.get('source')}; page={metadata.get('page')}; len={len(raw_text):_}")
        print(textwrap.fill(flattened[:max_chars], width=width))


# Show the first chunk of the largest and of the smallest setting for comparison.
preview_chunks(chunks_2000, n=1, max_chars=800)
preview_chunks(chunks_200, n=1)

## 4) Embeddings + vector store

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma


# Model name passed to OpenAIEmbeddings when a vector store is built.
EMBED_MODEL = "text-embedding-3-small"


def build_chroma(chunks: list[Document], persist_dir: str = "chroma_boe", collection: str = "boe") -> Chroma:
    """Embed ``chunks`` with ``EMBED_MODEL`` and store them in a Chroma collection.

    Args:
        chunks: Documents to embed and store.
        persist_dir: Passed to Chroma as ``persist_directory``.
        collection: Passed to Chroma as ``collection_name``.

    Returns:
        The ``Chroma`` instance returned by ``Chroma.from_documents``.
    """
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
    # NOTE(review): if persist_dir already holds this collection from an earlier
    # run, from_documents presumably adds to it, so re-running the cell could
    # duplicate entries — confirm against the Chroma documentation.
    db = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_dir,
        collection_name=collection,
    )
    return db


# Vector store for the 800-character chunks, kept in its own directory and collection.
db_800 = build_chroma(chunks_800, persist_dir="chroma_boe_800", collection="boe_800")
print("✅ Chroma built (800).")

In [None]:
def retrieve_with_scores(db: Chroma, query_: str, n_results: int = 4) -> list[tuple[Document, float]]:
    """Run a scored similarity search for ``query_`` against ``db``.

    Args:
        db: Vector store to search.
        query_: Query text.
        n_results: Passed to the store as ``k``.

    Returns:
        The result of ``db.similarity_search_with_score``, unchanged.
    """
    scored_hits = db.similarity_search_with_score(query_, k=n_results)
    return scored_hits


def print_chroma_hits(hits: list[tuple[Document, float]], max_chars: int = 600, width: int = 110) -> None:
    """Print scored search hits, one wrapped preview per hit.

    Args:
        hits: ``(document, score)`` pairs, printed in the order given.
        max_chars: Number of leading characters of each ``page_content`` shown.
        width: Line width used when wrapping the preview text.

    A one-line legend is printed first, even when ``hits`` is empty. Each hit
    then gets a header with its position, its score to four decimals and the
    ``page`` and ``source`` metadata entries.
    """
    # Plain literal: there are no placeholders, so no f-string is needed.
    # NOTE(review): the distance metric depends on how the collection was
    # created — confirm that it really is cosine distance.
    print("Score: less is better for cosine-distance based search.")
    for i, (doc, score) in enumerate(hits):
        md = doc.metadata
        print(f"\n#{i} score={score:.4f}; page={md.get('page')}; source={md.get('source')}")
        # Truncate first, then collapse newlines so the text wraps as one paragraph.
        print(textwrap.fill(doc.page_content[:max_chars].replace("\n", " "), width=width))


# Sample question (in Spanish) used to probe retrieval; it is reused by the
# chunk-size comparison further down.
query = "¿Cuál es el porcentaje máximo de subvención para proyectos en fase comercial?"  # noqa
results = retrieve_with_scores(db_800, query, n_results=4)
print_chroma_hits(results)

In [None]:
# Build one store per remaining chunk size, each in its own directory and collection.
db_2000 = build_chroma(chunks_2000, persist_dir="chroma_boe_2000", collection="boe_2000")
db_200 = build_chroma(chunks_200, persist_dir="chroma_boe_200", collection="boe_200")

# Run the same query against both stores; `results` is overwritten each time.
print("\n\nUsing chunk size 2000:")
results = retrieve_with_scores(db_2000, query, n_results=3)
print_chroma_hits(results)

print("\n\nUsing chunk size 200:")
results = retrieve_with_scores(db_200, query, n_results=3)
print_chroma_hits(results)

## 6) Generación

In [32]:
from langchain_openai import ChatOpenAI


# Chat model shared by both answer helpers below; temperature is set to 0.0.
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.0)


def answer_no_context(question: str) -> str:
    """Send ``question`` unchanged to the module-level ``llm`` and return the reply's ``content``."""
    reply = llm.invoke(question)
    return reply.content


def answer_with_context(question: str, retrieved_docs: list[Document]) -> str:
    """Answer ``question`` with the module-level ``llm``, using ``retrieved_docs`` as context.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        retrieved_docs: Documents whose text forms the context, in the order given.

    Returns:
        The ``content`` of the model's reply.

    The prompt (in Spanish) tells the model to answer from the context and to
    reply "no lo sé" when the answer is not contained in it.
    """
    # One section per document: a "[source=...; page=...]" header line followed
    # by the document text; sections are separated by a blank line.
    sections = []
    for doc in retrieved_docs:
        meta = doc.metadata
        header = f"[source={meta.get('source')}; page={meta.get('page')}]"
        sections.append(f"{header}\n{doc.page_content}")
    context = "\n\n".join(sections)

    # noinspection SpellCheckingInspection
    prompt = (
        "Usa el siguiente contexto para responder la pregunta.\n"
        'Si la respuesta no está contenida en el contexto, di "no lo sé".\n'
        "\n"
        "Contexto:\n"
        f"{context}\n"
        "\n"
        "Pregunta:\n"
        f"{question}\n"
    )
    response = llm.invoke(prompt)
    return response.content