diff --git a/CHANGELOG.md b/CHANGELOG.md
index f36908738..35a2154fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,12 @@
 
 - Fixed an edge case where the LLM can output a property with type 'map', which was causing errors during import as it is not a valid property type in Neo4j.
 
+### Added
+
+- Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`.
+- Document metadata can now be provided to the SimpleKGPipeline run method via the `document_metadata` parameter.
+
+
 ## 1.9.1
 
 ### Fixed
diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst
index b574171bb..65ccbd82d 100644
--- a/docs/source/user_guide_kg_builder.rst
+++ b/docs/source/user_guide_kg_builder.rst
@@ -219,6 +219,15 @@ chunk overlap in the text splitter component:
     )
 
 
+Run Parameters
+--------------
+
+SimpleKGPipeline also accepts additional runtime parameters:
+
+- ``document_metadata`` (dict): each item is saved as a property on the ``Document`` node.
+
+
+
 Using a Config file
 ===================
diff --git a/examples/build_graph/simple_kg_builder_from_pdf.py b/examples/build_graph/simple_kg_builder_from_pdf.py
index 2cfc85134..d3b2948f8 100644
--- a/examples/build_graph/simple_kg_builder_from_pdf.py
+++ b/examples/build_graph/simple_kg_builder_from_pdf.py
@@ -54,7 +54,12 @@ async def define_and_run_pipeline(
         },
         neo4j_database=DATABASE,
     )
-    return await kg_builder.run_async(file_path=str(file_path))
+    return await kg_builder.run_async(
+        file_path=str(file_path),
+        # optional: add document metadata; each item will
+        # be saved as a property of the Document node
+        # document_metadata={"author": "J. K. Rowling"},
+    )
 
 
 async def main() -> PipelineResult:
diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py
index 548cbd9eb..18adde4ac 100644
--- a/examples/build_graph/simple_kg_builder_from_text.py
+++ b/examples/build_graph/simple_kg_builder_from_text.py
@@ -79,7 +79,15 @@ async def define_and_run_pipeline(
         from_pdf=False,
         neo4j_database=DATABASE,
     )
-    return await kg_builder.run_async(text=TEXT)
+    return await kg_builder.run_async(
+        text=TEXT,
+        # optional: specify a file path for the Document node;
+        # if not provided, the path defaults to "document.txt"
+        # file_path="my_document.txt",
+        # optional: add document metadata; each item will
+        # be saved as a property of the Document node
+        # document_metadata={"author": "Frank Herbert"},
+    )
 
 
 async def main() -> PipelineResult:
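Taken together, the documented `document_metadata` and `file_path` run parameters are used as in the minimal sketch below. This is illustrative only and not part of the patch; the `driver`, `llm`, and `embedder` objects are placeholders assumed to be configured as in the surrounding examples.

from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline


async def build_graph(driver, llm, embedder) -> None:
    # A Document node is now created even when from_pdf=False.
    kg_builder = SimpleKGPipeline(
        llm=llm,
        driver=driver,
        embedder=embedder,
        from_pdf=False,
    )
    await kg_builder.run_async(
        text="Some text to ingest.",
        # optional: sets the Document node "path" property
        # (defaults to "document.txt" when omitted)
        file_path="my_document.txt",
        # optional: each item becomes a property on the Document node
        document_metadata={"author": "Frank Herbert"},
    )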
diff --git a/src/neo4j_graphrag/experimental/components/lexical_graph.py b/src/neo4j_graphrag/experimental/components/lexical_graph.py
index bbe99b80b..fb5583c04 100644
--- a/src/neo4j_graphrag/experimental/components/lexical_graph.py
+++ b/src/neo4j_graphrag/experimental/components/lexical_graph.py
@@ -113,6 +113,7 @@ def create_document_node(self, document_info: DocumentInfo) -> Neo4jNode:
            properties={
                "path": document_info.path,
                "createdAt": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+                "document_type": document_info.document_type,
                **document_metadata,
            },
        )
diff --git a/src/neo4j_graphrag/experimental/components/pdf_loader.py b/src/neo4j_graphrag/experimental/components/pdf_loader.py
index 979cdb665..8bfe2502e 100644
--- a/src/neo4j_graphrag/experimental/components/pdf_loader.py
+++ b/src/neo4j_graphrag/experimental/components/pdf_loader.py
@@ -89,5 +89,6 @@ async def run(
             document_info=DocumentInfo(
                 path=filepath,
                 metadata=self.get_document_metadata(text, metadata),
+                document_type="pdf",
             ),
         )
diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py
index 363767ef3..b5ce07706 100644
--- a/src/neo4j_graphrag/experimental/components/types.py
+++ b/src/neo4j_graphrag/experimental/components/types.py
@@ -38,6 +38,7 @@ class DocumentInfo(DataModel):
     path: str
     metadata: Optional[Dict[str, str]] = None
     uid: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    document_type: Optional[str] = None
 
     @property
     def document_id(self) -> str:
diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py
index dc875d7c2..929111898 100644
--- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py
+++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import annotations
 
+from collections import defaultdict
 from typing import (
     Any,
     ClassVar,
@@ -336,37 +337,40 @@ def _get_connections(self) -> list[ConnectionDefinition]:
         return connections
 
     def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]:
-        run_params = {}
-        if self.lexical_graph_config:
-            run_params["extractor"] = {
-                "lexical_graph_config": self.lexical_graph_config,
-            }
-            run_params["writer"] = {
-                "lexical_graph_config": self.lexical_graph_config,
-            }
-            run_params["pruner"] = {
-                "lexical_graph_config": self.lexical_graph_config,
-            }
         text = user_input.get("text")
         file_path = user_input.get("file_path")
-        if not ((text is None) ^ (file_path is None)):
-            # exactly one of text or user_input must be set
+        if text is None and file_path is None:
+            # user must provide either text or file_path or both
             raise PipelineDefinitionError(
-                "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument."
+                "At least one of `text` (when from_pdf=False) or `file_path` (when from_pdf=True) argument must be provided."
             )
+        run_params: dict[str, dict[str, Any]] = defaultdict(dict)
+        if self.lexical_graph_config:
+            run_params["extractor"]["lexical_graph_config"] = self.lexical_graph_config
+            run_params["writer"]["lexical_graph_config"] = self.lexical_graph_config
+            run_params["pruner"]["lexical_graph_config"] = self.lexical_graph_config
         if self.from_pdf:
             if not file_path:
                 raise PipelineDefinitionError(
                     "Expected 'file_path' argument when 'from_pdf' is True."
                 )
-            run_params["pdf_loader"] = {"filepath": file_path}
+            run_params["pdf_loader"]["filepath"] = file_path
+            run_params["pdf_loader"]["metadata"] = user_input.get("document_metadata")
         else:
             if not text:
                 raise PipelineDefinitionError(
                     "Expected 'text' argument when 'from_pdf' is False."
                 )
-            run_params["splitter"] = {"text": text}
+            run_params["splitter"]["text"] = text
             # Add full text to schema component for automatic schema extraction
             if not self.has_user_provided_schema():
-                run_params["schema"] = {"text": text}
+                run_params["schema"]["text"] = text
+            run_params["extractor"]["document_info"] = dict(
+                path=user_input.get(
+                    "file_path",
+                )
+                or "document.txt",
+                metadata=user_input.get("document_metadata"),
+                document_type="inline_text",
+            )
         return run_params
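For reference, the `get_run_params` change above now yields a nested mapping of per-component inputs. For the text path it produces roughly the shape sketched below (values are illustrative only, mirroring the updated unit tests further down; when a `lexical_graph_config` is set, the `extractor`, `writer`, and `pruner` entries additionally carry a `lexical_graph_config` key).

# Approximate output of SimpleKGPipelineConfig.get_run_params(
#     {"text": "my text", "document_metadata": {"author": "..."}}
# ) when from_pdf=False and no user-provided schema exists:
run_params = {
    "splitter": {"text": "my text"},
    "schema": {"text": "my text"},  # only set when the schema is auto-extracted
    "extractor": {
        "document_info": {
            "path": "document.txt",         # or the user-supplied file_path
            "metadata": {"author": "..."},  # None when document_metadata is omitted
            "document_type": "inline_text",
        },
    },
}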
) - run_params["splitter"] = {"text": text} + run_params["splitter"]["text"] = text # Add full text to schema component for automatic schema extraction if not self.has_user_provided_schema(): - run_params["schema"] = {"text": text} + run_params["schema"]["text"] = text + run_params["extractor"]["document_info"] = dict( + path=user_input.get( + "file_path", + ) + or "document.txt", + metadata=user_input.get("document_metadata"), + document_type="inline_text", + ) return run_params diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index 68f579c8b..b7313b3b0 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -145,16 +145,26 @@ def __init__( self.runner = PipelineRunner.from_config(config) async def run_async( - self, file_path: Optional[str] = None, text: Optional[str] = None + self, + file_path: Optional[str] = None, + text: Optional[str] = None, + document_metadata: Optional[dict[str, Any]] = None, ) -> PipelineResult: """ Asynchronously runs the knowledge graph building process. Args: - file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. + file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. If `from_pdf` is False, can be used to set the Document node path property. text (Optional[str]): The text content to process. Required if `from_pdf` is False. + document_metadata (Optional[dict[str, Any]]): The metadata to attach to the document. Returns: PipelineResult: The result of the pipeline execution. """ - return await self.runner.run({"file_path": file_path, "text": text}) + return await self.runner.run( + { + "file_path": file_path, + "text": text, + "document_metadata": document_metadata, + } + ) diff --git a/tests/unit/experimental/components/test_lexical_graph_builder.py b/tests/unit/experimental/components/test_lexical_graph_builder.py index 4621c77e9..788855f61 100644 --- a/tests/unit/experimental/components/test_lexical_graph_builder.py +++ b/tests/unit/experimental/components/test_lexical_graph_builder.py @@ -78,7 +78,11 @@ async def test_lexical_graph_builder_run_with_document() -> None: TextChunk(text="text chunk 1", index=1), ] ), - document_info=DocumentInfo(path="test_lexical_graph", uid=doc_uid), + document_info=DocumentInfo( + path="test_lexical_graph", + uid=doc_uid, + document_type="my_type", + ), ) assert isinstance(result, GraphResult) graph = result.graph @@ -89,6 +93,7 @@ async def test_lexical_graph_builder_run_with_document() -> None: assert document.label == DEFAULT_DOCUMENT_NODE_LABEL assert document.properties["path"] == "test_lexical_graph" assert document.properties["createdAt"] is not None + assert document.properties["document_type"] == "my_type" chunk1 = nodes[1] assert chunk1.label == DEFAULT_CHUNK_NODE_LABEL chunk2 = nodes[2] diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 40f5dae34..2c8d2a0fd 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/tests/unit/experimental/components/test_lexical_graph_builder.py b/tests/unit/experimental/components/test_lexical_graph_builder.py
index 4621c77e9..788855f61 100644
--- a/tests/unit/experimental/components/test_lexical_graph_builder.py
+++ b/tests/unit/experimental/components/test_lexical_graph_builder.py
@@ -78,7 +78,11 @@ async def test_lexical_graph_builder_run_with_document() -> None:
                 TextChunk(text="text chunk 1", index=1),
             ]
         ),
-        document_info=DocumentInfo(path="test_lexical_graph", uid=doc_uid),
+        document_info=DocumentInfo(
+            path="test_lexical_graph",
+            uid=doc_uid,
+            document_type="my_type",
+        ),
     )
     assert isinstance(result, GraphResult)
     graph = result.graph
@@ -89,6 +93,7 @@
     assert document.label == DEFAULT_DOCUMENT_NODE_LABEL
     assert document.properties["path"] == "test_lexical_graph"
     assert document.properties["createdAt"] is not None
+    assert document.properties["document_type"] == "my_type"
     chunk1 = nodes[1]
    assert chunk1.label == DEFAULT_CHUNK_NODE_LABEL
     chunk2 = nodes[2]
diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py
index 40f5dae34..2c8d2a0fd 100644
--- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py
+++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from unittest.mock import Mock, patch
 
 import neo4j
@@ -286,16 +287,16 @@ def test_simple_kg_pipeline_config_connections_with_er() -> None:
 def test_simple_kg_pipeline_config_run_params_from_pdf_file_path() -> None:
     config = SimpleKGPipelineConfig(from_pdf=True)
     assert config.get_run_params({"file_path": "my_file"}) == {
-        "pdf_loader": {"filepath": "my_file"}
+        "pdf_loader": {"filepath": "my_file", "metadata": None}
     }
 
 
 def test_simple_kg_pipeline_config_run_params_from_text_text() -> None:
     config = SimpleKGPipelineConfig(from_pdf=False)
-    assert config.get_run_params({"text": "my text"}) == {
-        "splitter": {"text": "my text"},
-        "schema": {"text": "my text"},
-    }
+    run_params = config.get_run_params({"text": "my text"})
+    assert run_params["splitter"] == {"text": "my text"}
+    assert run_params["schema"] == {"text": "my text"}
+    assert run_params["extractor"]["document_info"]["path"] == "document.txt"
 
 
 def test_simple_kg_pipeline_config_run_params_from_pdf_text() -> None:
@@ -314,22 +315,13 @@ def test_simple_kg_pipeline_config_run_params_from_text_file_path() -> None:
 
 def test_simple_kg_pipeline_config_run_params_no_file_no_text() -> None:
     config = SimpleKGPipelineConfig(from_pdf=False)
-    with pytest.raises(PipelineDefinitionError) as excinfo:
+    with pytest.raises(
+        PipelineDefinitionError,
+        match=re.escape(
+            "At least one of `text` (when from_pdf=False) or `file_path` (when from_pdf=True) argument must be provided."
+        ),
+    ):
         config.get_run_params({})
-    assert (
-        "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument."
-        in str(excinfo)
-    )
-
-
-def test_simple_kg_pipeline_config_run_params_both_file_and_text() -> None:
-    config = SimpleKGPipelineConfig(from_pdf=False)
-    with pytest.raises(PipelineDefinitionError) as excinfo:
-        config.get_run_params({"text": "my text", "file_path": "my file"})
-    assert (
-        "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument."
-        in str(excinfo)
-    )
 
 
 def test_simple_kg_pipeline_config_process_schema_with_precedence_legacy() -> None:
diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py
index 62abc1c41..8634a1e4e 100644
--- a/tests/unit/experimental/pipeline/test_kg_builder.py
+++ b/tests/unit/experimental/pipeline/test_kg_builder.py
@@ -18,7 +18,9 @@
 import neo4j
 import pytest
 from neo4j_graphrag.embeddings import Embedder
-from neo4j_graphrag.experimental.components.types import LexicalGraphConfig
+from neo4j_graphrag.experimental.components.types import (
+    LexicalGraphConfig,
+)
 from neo4j_graphrag.experimental.pipeline.exceptions import PipelineDefinitionError
 from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
 from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
@@ -49,11 +51,16 @@ async def test_knowledge_graph_builder_document_info_with_file(_: Mock) -> None:
         "run",
         return_value=PipelineResult(run_id="test_run", result=None),
     ) as mock_run:
-        await kg_builder.run_async(file_path=file_path)
+        await kg_builder.run_async(
+            file_path=file_path, document_metadata={"source": "google drive"}
+        )
 
     pipe_inputs = mock_run.call_args[1]["data"]
     assert "pdf_loader" in pipe_inputs
-    assert pipe_inputs["pdf_loader"] == {"filepath": file_path}
+    assert pipe_inputs["pdf_loader"] == {
+        "filepath": file_path,
+        "metadata": {"source": "google drive"},
+    }
 
     assert "extractor" not in pipe_inputs
 
@@ -81,11 +88,19 @@ async def test_knowledge_graph_builder_document_info_with_text(_: Mock) -> None:
         "run",
         return_value=PipelineResult(run_id="test_run", result=None),
     ) as mock_run:
-        await kg_builder.run_async(text=text_input)
+        await kg_builder.run_async(
+            text=text_input,
+            file_path="my_document.txt",
+            document_metadata={"source": "google drive"},
+        )
 
     pipe_inputs = mock_run.call_args[1]["data"]
     assert "splitter" in pipe_inputs
     assert pipe_inputs["splitter"] == {"text": text_input}
+    assert pipe_inputs["extractor"]["document_info"]["path"] == "my_document.txt"
+    assert pipe_inputs["extractor"]["document_info"]["metadata"] == {
+        "source": "google drive"
+    }
 
 
 @mock.patch(
@@ -175,6 +190,6 @@ async def test_knowledge_graph_builder_with_lexical_graph_config(_: Mock) -> Non
 
     pipe_inputs = mock_run.call_args[1]["data"]
     assert "extractor" in pipe_inputs
-    assert pipe_inputs["extractor"] == {
-        "lexical_graph_config": lexical_graph_config
-    }
+    assert pipe_inputs["extractor"]["lexical_graph_config"] == lexical_graph_config
+    assert pipe_inputs["extractor"]["document_info"] is not None
+    assert pipe_inputs["extractor"]["document_info"]["path"] == "document.txt"
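As a quick way to inspect the resulting Document node after a run, a hedged sketch using the Neo4j Python driver is shown below. It assumes the default `Document` label from the lexical graph configuration; the `author` property is only an example and exists only if it was passed via `document_metadata`.

import neo4j


def show_documents(driver: neo4j.Driver) -> None:
    # Illustrative query against the default lexical graph labels.
    records, _, _ = driver.execute_query(
        "MATCH (d:Document) "
        "RETURN d.path AS path, d.document_type AS document_type, "
        "d.createdAt AS createdAt, d.author AS author"
    )
    for record in records:
        print(record["path"], record["document_type"], record["author"])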