# Graph RAG ingestion pipeline demo

This notebook walks through a full ingestion run that loads every document in `tests/test_data`, extracts entities/relationships, and writes both vector and graph stores. Run the cells sequentially to set up the project paths, configure the LLM caller, run the ingestion, and persist the resulting artifacts.

## Prerequisites

1. Install the project dependencies and ensure you can import the `ragdoll` package.
2. Export the credentials required by your chosen LLM/embedding providers (for example `OPENAI_API_KEY`).
3. (Optional) Clean up `data/vector_stores/graph_rag_demo` and `data/graph_stores/graph_rag_demo` if you want a fresh run.

In [1]:
from pathlib import Path
import sys

def find_project_root(marker: str = "pyproject.toml") -> Path:
    """Locate the repository root by walking upward from the working directory.

    Args:
        marker: File or directory name whose presence identifies the root.

    Returns:
        The resolved working directory itself, or its closest ancestor, that
        contains ``marker``.

    Raises:
        RuntimeError: If neither the working directory nor any of its
            ancestors contains ``marker``.
    """
    start = Path.cwd().resolve()
    # Nearest match wins: the starting directory is checked before its parents.
    root = next(
        (folder for folder in [start, *start.parents] if (folder / marker).exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Unable to locate the project root—open this notebook inside the repository.")
    return root

# Resolve the repository root and make it importable (appended, so any
# already-listed entry keeps its priority on sys.path).
PROJECT_ROOT = find_project_root()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.append(str(PROJECT_ROOT))

# Output locations for this demo, all rooted under <project>/data.
DATA_DIR = PROJECT_ROOT / "data"
VECTOR_STORE_DIR = DATA_DIR.joinpath("vector_stores", "graph_rag_demo")
GRAPH_STORE_DIR = DATA_DIR.joinpath("graph_stores", "graph_rag_demo")
GRAPH_STORE_FILE = GRAPH_STORE_DIR.joinpath("graph.pkl")

# Create both store directories up front; existing ones are left untouched.
for directory in [VECTOR_STORE_DIR, GRAPH_STORE_DIR]:
    directory.mkdir(exist_ok=True, parents=True)

# Echo the resolved locations, one per line.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Vector store directory: {VECTOR_STORE_DIR}",
    f"Graph store directory: {GRAPH_STORE_DIR}",
    f"Graph store file: {GRAPH_STORE_FILE}",
    sep="\n",
)


Project root: C:\dev\RAGdoll
Vector store directory: C:\dev\RAGdoll\data\vector_stores\graph_rag_demo
Graph store directory: C:\dev\RAGdoll\data\graph_stores\graph_rag_demo
Graph store file: C:\dev\RAGdoll\data\graph_stores\graph_rag_demo\graph.pkl


In [None]:
import logging
from ragdoll.pipeline import IngestionOptions, IngestionPipeline
from ragdoll.llms import get_llm_caller

# Model identifier handed to the caller factory; change it here to try another model.
MODEL_NAME = "gpt-4o"

# Configure root logging before the caller is created.
logging.basicConfig(level=logging.INFO)

# A None result from the factory is treated as a failed initialisation.
llm_caller = get_llm_caller(MODEL_NAME)
if llm_caller is not None:
    print(f"LLM caller ready for {MODEL_NAME}.")
else:
    raise RuntimeError("Unable to initialise the requested LLM. Check your configuration or API keys.")

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


LLM caller ready for gpt-4o.


In [None]:
from pprint import pprint

# Every regular file directly inside tests/test_data becomes an ingestion source
# (subdirectories are skipped; the listing is not recursive).
test_data_dir = PROJECT_ROOT / "tests" / "test_data"
if not test_data_dir.exists():
    # The message uses a real em dash (U+2014), matching the other error messages.
    raise FileNotFoundError(f"Could not find {test_data_dir}—check your checkout.")

# Paths are converted to strings and sorted so the source order is stable between runs.
sources = sorted(str(path) for path in test_data_dir.iterdir() if path.is_file())
print(f"Loaded {len(sources)} sources:")
pprint(sources)

In [None]:
# Single configuration object for the whole ingestion run.
options = IngestionOptions(
    batch_size=5,
    parallel_extraction=False,
    extract_entities=True,
    chunking_options=dict(chunk_size=1000, chunk_overlap=200),
    # Chroma can start from an empty collection; FAISS would need its
    # index/docstore wired up by hand first.
    vector_store_options=dict(
        store_type="chroma",
        params=dict(
            collection_name="graph_rag_demo",
            persist_directory=str(VECTOR_STORE_DIR),
        ),
    ),
    graph_store_options=dict(
        store_type="networkx",
        output_file=str(GRAPH_STORE_FILE),
    ),
    entity_extraction_options=dict(
        entity_types=["Person", "Organization", "Location", "Date"],
        relationship_types=["works_for", "born_in", "located_in"],
    ),
    llm_caller=llm_caller,
)

# Bare expression: the notebook renders the options object as the cell output.
options


In [None]:
# Build the pipeline from the options object and ingest every discovered source.
pipeline = IngestionPipeline(options=options)
# Top-level `await` relies on the IPython/Jupyter kernel running cells with
# autoawait; it is a syntax error in a plain Python script.
stats = await pipeline.ingest(sources)
# Bare expression: display whatever `ingest` returned as the cell output.
stats

In [None]:
# Inspect the graph left on the pipeline by the ingestion run, if there is one.
# NOTE(review): `last_graph` is read defensively with getattr — confirm the
# attribute name against IngestionPipeline.
graph = getattr(pipeline, "last_graph", None)
if graph is None:
    # The message uses a real em dash (U+2014) rather than mis-decoded bytes.
    print("No graph was produced—ensure entity extraction is enabled.")
else:
    print(f"Graph nodes: {graph.number_of_nodes()} | edges: {graph.number_of_edges()}")
    # Show at most five nodes and five edges so the output stays short.
    sample_nodes = list(graph.nodes())[:5]
    sample_edges = list(graph.edges(data=True))[:5]
    if sample_nodes:
        print("\nSample nodes:")
        for node in sample_nodes:
            print(" -", node)
    if sample_edges:
        print("\nSample edges:")
        for edge in sample_edges:
            print(" -", edge)

# Bare expression: re-display the ingestion stats below the graph summary.
stats