From 3ec6d461eecea12dc466f1d976e9a97d11f56d8c Mon Sep 17 00:00:00 2001 From: estelle Date: Wed, 20 Aug 2025 17:34:41 +0200 Subject: [PATCH 1/8] Create Document node even from text input --- CHANGELOG.md | 6 ++++ .../build_graph/simple_kg_builder_from_pdf.py | 7 ++++- .../simple_kg_builder_from_text.py | 10 +++++- .../template_pipeline/simple_kg_builder.py | 31 ++++++++++--------- .../experimental/pipeline/kg_builder.py | 17 ++++++++-- .../test_simple_kg_builder.py | 10 +++--- .../experimental/pipeline/test_kg_builder.py | 27 +++++++++++----- 7 files changed, 78 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f36908738..35a2154fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,12 @@ - Fixed an edge case where the LLM can output a property with type 'map', which was causing errors during import as it is not a valid property type in Neo4j. +### Added + +- Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`. +- Document metadata is exposed in SimpleKGPipeline run method. + + ## 1.9.1 ### Fixed diff --git a/examples/build_graph/simple_kg_builder_from_pdf.py b/examples/build_graph/simple_kg_builder_from_pdf.py index 2cfc85134..0f98e0e66 100644 --- a/examples/build_graph/simple_kg_builder_from_pdf.py +++ b/examples/build_graph/simple_kg_builder_from_pdf.py @@ -54,7 +54,12 @@ async def define_and_run_pipeline( }, neo4j_database=DATABASE, ) - return await kg_builder.run_async(file_path=str(file_path)) + return await kg_builder.run_async( + file_path=str(file_path), + # optional, add document metadata, each item will + # be saved as a property of the Document node + document_metadata={"author": "J. K. Rowling"}, + ) async def main() -> PipelineResult: diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py index 548cbd9eb..330a3a8bd 100644 --- a/examples/build_graph/simple_kg_builder_from_text.py +++ b/examples/build_graph/simple_kg_builder_from_text.py @@ -79,7 +79,15 @@ async def define_and_run_pipeline( from_pdf=False, neo4j_database=DATABASE, ) - return await kg_builder.run_async(text=TEXT) + return await kg_builder.run_async( + text=TEXT, + # optional, specify document path for the Document node + # if not, a random name will be generated + # document_path="my_document.txt" + # optional, add document metadata, each item will + # be saved as a property of the Document node + # document_metadata={"author": "Frank Herbert"}, + ) async def main() -> PipelineResult: diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index dc875d7c2..76618efed 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -14,6 +14,7 @@ # limitations under the License. from __future__ import annotations +from collections import defaultdict from typing import ( Any, ClassVar, @@ -336,17 +337,6 @@ def _get_connections(self) -> list[ConnectionDefinition]: return connections def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: - run_params = {} - if self.lexical_graph_config: - run_params["extractor"] = { - "lexical_graph_config": self.lexical_graph_config, - } - run_params["writer"] = { - "lexical_graph_config": self.lexical_graph_config, - } - run_params["pruner"] = { - "lexical_graph_config": self.lexical_graph_config, - } text = user_input.get("text") file_path = user_input.get("file_path") if not ((text is None) ^ (file_path is None)): @@ -354,19 +344,32 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: raise PipelineDefinitionError( "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument." ) + run_params: dict[str, dict[str, Any]] = defaultdict(dict) + if self.lexical_graph_config: + run_params["extractor"]["lexical_graph_config"] = self.lexical_graph_config + run_params["writer"]["lexical_graph_config"] = self.lexical_graph_config + run_params["pruner"]["lexical_graph_config"] = self.lexical_graph_config if self.from_pdf: if not file_path: raise PipelineDefinitionError( "Expected 'file_path' argument when 'from_pdf' is True." ) - run_params["pdf_loader"] = {"filepath": file_path} + run_params["pdf_loader"]["filepath"] = file_path + run_params["pdf_loader"]["metadata"] = user_input.get("document_metadata") else: if not text: raise PipelineDefinitionError( "Expected 'text' argument when 'from_pdf' is False." ) - run_params["splitter"] = {"text": text} + run_params["splitter"]["text"] = text # Add full text to schema component for automatic schema extraction if not self.has_user_provided_schema(): - run_params["schema"] = {"text": text} + run_params["schema"]["text"] = text + run_params["extractor"]["document_info"] = dict( + path=user_input.get( + "document_path", + ) + or "document.txt", + metadata=user_input.get("document_metadata"), + ) return run_params diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index 68f579c8b..531331ff4 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -145,7 +145,11 @@ def __init__( self.runner = PipelineRunner.from_config(config) async def run_async( - self, file_path: Optional[str] = None, text: Optional[str] = None + self, + file_path: Optional[str] = None, + text: Optional[str] = None, + document_path: Optional[str] = None, + document_metadata: Optional[dict[str, Any]] = None, ) -> PipelineResult: """ Asynchronously runs the knowledge graph building process. @@ -153,8 +157,17 @@ async def run_async( Args: file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. text (Optional[str]): The text content to process. Required if `from_pdf` is False. + document_path (Optional[str]): The path to the document to process. Required if `from_pdf` is True. + document_metadata (Optional[dict[str, Any]]): The metadata to attach to the document. Returns: PipelineResult: The result of the pipeline execution. """ - return await self.runner.run({"file_path": file_path, "text": text}) + return await self.runner.run( + { + "file_path": file_path, + "text": text, + "document_path": document_path, + "document_metadata": document_metadata, + } + ) diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 40f5dae34..d8fd01f8f 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -286,16 +286,16 @@ def test_simple_kg_pipeline_config_connections_with_er() -> None: def test_simple_kg_pipeline_config_run_params_from_pdf_file_path() -> None: config = SimpleKGPipelineConfig(from_pdf=True) assert config.get_run_params({"file_path": "my_file"}) == { - "pdf_loader": {"filepath": "my_file"} + "pdf_loader": {"filepath": "my_file", "metadata": None} } def test_simple_kg_pipeline_config_run_params_from_text_text() -> None: config = SimpleKGPipelineConfig(from_pdf=False) - assert config.get_run_params({"text": "my text"}) == { - "splitter": {"text": "my text"}, - "schema": {"text": "my text"}, - } + run_params = config.get_run_params({"text": "my text"}) + assert run_params["splitter"] == {"text": "my text"} + assert run_params["schema"] == {"text": "my text"} + assert run_params["extractor"]["document_info"]["path"] == "document.txt" def test_simple_kg_pipeline_config_run_params_from_pdf_text() -> None: diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py index 62abc1c41..ff8ab3117 100644 --- a/tests/unit/experimental/pipeline/test_kg_builder.py +++ b/tests/unit/experimental/pipeline/test_kg_builder.py @@ -18,7 +18,9 @@ import neo4j import pytest from neo4j_graphrag.embeddings import Embedder -from neo4j_graphrag.experimental.components.types import LexicalGraphConfig +from neo4j_graphrag.experimental.components.types import ( + LexicalGraphConfig, +) from neo4j_graphrag.experimental.pipeline.exceptions import PipelineDefinitionError from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult @@ -49,11 +51,14 @@ async def test_knowledge_graph_builder_document_info_with_file(_: Mock) -> None: "run", return_value=PipelineResult(run_id="test_run", result=None), ) as mock_run: - await kg_builder.run_async(file_path=file_path) + await kg_builder.run_async( + file_path=file_path, + document_metadata={"source": "google drive"} + ) pipe_inputs = mock_run.call_args[1]["data"] assert "pdf_loader" in pipe_inputs - assert pipe_inputs["pdf_loader"] == {"filepath": file_path} + assert pipe_inputs["pdf_loader"] == {"filepath": file_path, "metadata": {"source": "google drive"}} assert "extractor" not in pipe_inputs @@ -81,11 +86,19 @@ async def test_knowledge_graph_builder_document_info_with_text(_: Mock) -> None: "run", return_value=PipelineResult(run_id="test_run", result=None), ) as mock_run: - await kg_builder.run_async(text=text_input) + await kg_builder.run_async( + text=text_input, + document_path="my_document.txt", + document_metadata={"source": "google drive"}, + ) pipe_inputs = mock_run.call_args[1]["data"] assert "splitter" in pipe_inputs assert pipe_inputs["splitter"] == {"text": text_input} + assert pipe_inputs["extractor"]["document_info"]["path"] == "my_document.txt" + assert pipe_inputs["extractor"]["document_info"]["metadata"] == { + "source": "google drive" + } @mock.patch( @@ -175,6 +188,6 @@ async def test_knowledge_graph_builder_with_lexical_graph_config(_: Mock) -> Non pipe_inputs = mock_run.call_args[1]["data"] assert "extractor" in pipe_inputs - assert pipe_inputs["extractor"] == { - "lexical_graph_config": lexical_graph_config - } + assert pipe_inputs["extractor"]["lexical_graph_config"] == lexical_graph_config + assert pipe_inputs["extractor"]["document_info"] is not None + assert pipe_inputs["extractor"]["document_info"]["path"] == "document.txt" From 9f2252588edebebae70623b0bd64f26064a6f83a Mon Sep 17 00:00:00 2001 From: estelle Date: Thu, 21 Aug 2025 11:00:20 +0200 Subject: [PATCH 2/8] Update doc --- docs/source/user_guide_kg_builder.rst | 10 ++++++++++ examples/build_graph/simple_kg_builder_from_pdf.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index b574171bb..668ef72b1 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -219,6 +219,16 @@ chunk overlap in the text splitter component: ) +Run Parameters +-------------- + +SimpleKGPipeline also accepts addition runtime parameters: + +- ``document_path`` (str): only used when ``from_pdf=False``, this is the path property of the ``Document`` node. +- ``document_metadata`` (dict): each item will be saved as a property attached to the ``Document`` node. + + + Using a Config file =================== diff --git a/examples/build_graph/simple_kg_builder_from_pdf.py b/examples/build_graph/simple_kg_builder_from_pdf.py index 0f98e0e66..d3b2948f8 100644 --- a/examples/build_graph/simple_kg_builder_from_pdf.py +++ b/examples/build_graph/simple_kg_builder_from_pdf.py @@ -58,7 +58,7 @@ async def define_and_run_pipeline( file_path=str(file_path), # optional, add document metadata, each item will # be saved as a property of the Document node - document_metadata={"author": "J. K. Rowling"}, + # document_metadata={"author": "J. K. Rowling"}, ) From 1c5b0de4cb22a67969e0868b5e562b5b2f60c183 Mon Sep 17 00:00:00 2001 From: estelle Date: Thu, 21 Aug 2025 11:00:55 +0200 Subject: [PATCH 3/8] Ruff --- tests/unit/experimental/pipeline/test_kg_builder.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py index ff8ab3117..d476d9d96 100644 --- a/tests/unit/experimental/pipeline/test_kg_builder.py +++ b/tests/unit/experimental/pipeline/test_kg_builder.py @@ -52,13 +52,15 @@ async def test_knowledge_graph_builder_document_info_with_file(_: Mock) -> None: return_value=PipelineResult(run_id="test_run", result=None), ) as mock_run: await kg_builder.run_async( - file_path=file_path, - document_metadata={"source": "google drive"} + file_path=file_path, document_metadata={"source": "google drive"} ) pipe_inputs = mock_run.call_args[1]["data"] assert "pdf_loader" in pipe_inputs - assert pipe_inputs["pdf_loader"] == {"filepath": file_path, "metadata": {"source": "google drive"}} + assert pipe_inputs["pdf_loader"] == { + "filepath": file_path, + "metadata": {"source": "google drive"}, + } assert "extractor" not in pipe_inputs From 793ff8c2ff7bdc0c72198ec32ec330715075f1ea Mon Sep 17 00:00:00 2001 From: estelle Date: Thu, 21 Aug 2025 18:35:10 +0200 Subject: [PATCH 4/8] Save document_type --- .../experimental/components/lexical_graph.py | 1 + src/neo4j_graphrag/experimental/components/pdf_loader.py | 1 + src/neo4j_graphrag/experimental/components/types.py | 1 + .../pipeline/config/template_pipeline/simple_kg_builder.py | 1 + .../experimental/components/test_lexical_graph_builder.py | 7 ++++++- 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/neo4j_graphrag/experimental/components/lexical_graph.py b/src/neo4j_graphrag/experimental/components/lexical_graph.py index bbe99b80b..fb5583c04 100644 --- a/src/neo4j_graphrag/experimental/components/lexical_graph.py +++ b/src/neo4j_graphrag/experimental/components/lexical_graph.py @@ -113,6 +113,7 @@ def create_document_node(self, document_info: DocumentInfo) -> Neo4jNode: properties={ "path": document_info.path, "createdAt": datetime.datetime.now(datetime.timezone.utc).isoformat(), + "document_type": document_info.document_type, **document_metadata, }, ) diff --git a/src/neo4j_graphrag/experimental/components/pdf_loader.py b/src/neo4j_graphrag/experimental/components/pdf_loader.py index 979cdb665..8bfe2502e 100644 --- a/src/neo4j_graphrag/experimental/components/pdf_loader.py +++ b/src/neo4j_graphrag/experimental/components/pdf_loader.py @@ -89,5 +89,6 @@ async def run( document_info=DocumentInfo( path=filepath, metadata=self.get_document_metadata(text, metadata), + document_type="pdf", ), ) diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py index 363767ef3..0062593a1 100644 --- a/src/neo4j_graphrag/experimental/components/types.py +++ b/src/neo4j_graphrag/experimental/components/types.py @@ -38,6 +38,7 @@ class DocumentInfo(DataModel): path: str metadata: Optional[Dict[str, str]] = None uid: str = Field(default_factory=lambda: str(uuid.uuid4())) + document_type: Optional[document_type] = None @property def document_id(self) -> str: diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 76618efed..832a70ca5 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -371,5 +371,6 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: ) or "document.txt", metadata=user_input.get("document_metadata"), + document_type="inline_text", ) return run_params diff --git a/tests/unit/experimental/components/test_lexical_graph_builder.py b/tests/unit/experimental/components/test_lexical_graph_builder.py index 4621c77e9..788855f61 100644 --- a/tests/unit/experimental/components/test_lexical_graph_builder.py +++ b/tests/unit/experimental/components/test_lexical_graph_builder.py @@ -78,7 +78,11 @@ async def test_lexical_graph_builder_run_with_document() -> None: TextChunk(text="text chunk 1", index=1), ] ), - document_info=DocumentInfo(path="test_lexical_graph", uid=doc_uid), + document_info=DocumentInfo( + path="test_lexical_graph", + uid=doc_uid, + document_type="my_type", + ), ) assert isinstance(result, GraphResult) graph = result.graph @@ -89,6 +93,7 @@ async def test_lexical_graph_builder_run_with_document() -> None: assert document.label == DEFAULT_DOCUMENT_NODE_LABEL assert document.properties["path"] == "test_lexical_graph" assert document.properties["createdAt"] is not None + assert document.properties["document_type"] == "my_type" chunk1 = nodes[1] assert chunk1.label == DEFAULT_CHUNK_NODE_LABEL chunk2 = nodes[2] From 9e151d13a9484b36e6f70013787a429987c1ecd3 Mon Sep 17 00:00:00 2001 From: estelle Date: Wed, 27 Aug 2025 16:23:34 +0200 Subject: [PATCH 5/8] Fix type --- src/neo4j_graphrag/experimental/components/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py index 0062593a1..b5ce07706 100644 --- a/src/neo4j_graphrag/experimental/components/types.py +++ b/src/neo4j_graphrag/experimental/components/types.py @@ -38,7 +38,7 @@ class DocumentInfo(DataModel): path: str metadata: Optional[Dict[str, str]] = None uid: str = Field(default_factory=lambda: str(uuid.uuid4())) - document_type: Optional[document_type] = None + document_type: Optional[str] = None @property def document_id(self) -> str: From 88f65e56c83b48c4567f29250011a78b52a2b81f Mon Sep 17 00:00:00 2001 From: estelle Date: Tue, 2 Sep 2025 15:31:52 +0200 Subject: [PATCH 6/8] Reuse file_path instead of introducing another document_path parameter --- docs/source/user_guide_kg_builder.rst | 1 - examples/build_graph/simple_kg_builder_from_text.py | 4 ++-- .../config/template_pipeline/simple_kg_builder.py | 6 +++--- src/neo4j_graphrag/experimental/pipeline/kg_builder.py | 5 +---- .../config/template_pipeline/test_simple_kg_builder.py | 10 ---------- tests/unit/experimental/pipeline/test_kg_builder.py | 2 +- 6 files changed, 7 insertions(+), 21 deletions(-) diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index 668ef72b1..65ccbd82d 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -224,7 +224,6 @@ Run Parameters SimpleKGPipeline also accepts addition runtime parameters: -- ``document_path`` (str): only used when ``from_pdf=False``, this is the path property of the ``Document`` node. - ``document_metadata`` (dict): each item will be saved as a property attached to the ``Document`` node. diff --git a/examples/build_graph/simple_kg_builder_from_text.py b/examples/build_graph/simple_kg_builder_from_text.py index 330a3a8bd..18adde4ac 100644 --- a/examples/build_graph/simple_kg_builder_from_text.py +++ b/examples/build_graph/simple_kg_builder_from_text.py @@ -81,9 +81,9 @@ async def define_and_run_pipeline( ) return await kg_builder.run_async( text=TEXT, - # optional, specify document path for the Document node + # optional, specify file path for the Document node # if not, a random name will be generated - # document_path="my_document.txt" + # file_path="my_document.txt" # optional, add document metadata, each item will # be saved as a property of the Document node # document_metadata={"author": "Frank Herbert"}, diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 832a70ca5..fef907d87 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -339,8 +339,8 @@ def _get_connections(self) -> list[ConnectionDefinition]: def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: text = user_input.get("text") file_path = user_input.get("file_path") - if not ((text is None) ^ (file_path is None)): - # exactly one of text or user_input must be set + if text is None and file_path is None: + # use must provide either text or file_path or both raise PipelineDefinitionError( "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument." ) @@ -367,7 +367,7 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: run_params["schema"]["text"] = text run_params["extractor"]["document_info"] = dict( path=user_input.get( - "document_path", + "file_path", ) or "document.txt", metadata=user_input.get("document_metadata"), diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py index 531331ff4..b7313b3b0 100644 --- a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -148,16 +148,14 @@ async def run_async( self, file_path: Optional[str] = None, text: Optional[str] = None, - document_path: Optional[str] = None, document_metadata: Optional[dict[str, Any]] = None, ) -> PipelineResult: """ Asynchronously runs the knowledge graph building process. Args: - file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. + file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. If `from_pdf` is False, can be used to set the Document node path property. text (Optional[str]): The text content to process. Required if `from_pdf` is False. - document_path (Optional[str]): The path to the document to process. Required if `from_pdf` is True. document_metadata (Optional[dict[str, Any]]): The metadata to attach to the document. Returns: @@ -167,7 +165,6 @@ async def run_async( { "file_path": file_path, "text": text, - "document_path": document_path, "document_metadata": document_metadata, } ) diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index d8fd01f8f..916f9395f 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -322,16 +322,6 @@ def test_simple_kg_pipeline_config_run_params_no_file_no_text() -> None: ) -def test_simple_kg_pipeline_config_run_params_both_file_and_text() -> None: - config = SimpleKGPipelineConfig(from_pdf=False) - with pytest.raises(PipelineDefinitionError) as excinfo: - config.get_run_params({"text": "my text", "file_path": "my file"}) - assert ( - "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument." - in str(excinfo) - ) - - def test_simple_kg_pipeline_config_process_schema_with_precedence_legacy() -> None: entities: list[EntityInputType] = [ "Person", diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py index d476d9d96..8634a1e4e 100644 --- a/tests/unit/experimental/pipeline/test_kg_builder.py +++ b/tests/unit/experimental/pipeline/test_kg_builder.py @@ -90,7 +90,7 @@ async def test_knowledge_graph_builder_document_info_with_text(_: Mock) -> None: ) as mock_run: await kg_builder.run_async( text=text_input, - document_path="my_document.txt", + file_path="my_document.txt", document_metadata={"source": "google drive"}, ) From a740640f8ab16427fc5c8c545a00c1b4993f538b Mon Sep 17 00:00:00 2001 From: estelle Date: Sun, 14 Sep 2025 10:25:21 +0200 Subject: [PATCH 7/8] Address comments --- .../pipeline/config/template_pipeline/simple_kg_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index fef907d87..41ac7557f 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -340,9 +340,9 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: text = user_input.get("text") file_path = user_input.get("file_path") if text is None and file_path is None: - # use must provide either text or file_path or both + # user must provide either text or file_path or both raise PipelineDefinitionError( - "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument." + "At least one of `text` (when from_pdf=False) or 'file_path' (when from_pdf=True) argument must be provided." ) run_params: dict[str, dict[str, Any]] = defaultdict(dict) if self.lexical_graph_config: From 36d9a035ab803cb93a46416c0c4c5fc9cdc17f84 Mon Sep 17 00:00:00 2001 From: estelle Date: Sun, 14 Sep 2025 13:43:01 +0200 Subject: [PATCH 8/8] Fix CI --- .../config/template_pipeline/simple_kg_builder.py | 2 +- .../template_pipeline/test_simple_kg_builder.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 41ac7557f..929111898 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -342,7 +342,7 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]: if text is None and file_path is None: # user must provide either text or file_path or both raise PipelineDefinitionError( - "At least one of `text` (when from_pdf=False) or 'file_path' (when from_pdf=True) argument must be provided." + "At least one of `text` (when from_pdf=False) or `file_path` (when from_pdf=True) argument must be provided." ) run_params: dict[str, dict[str, Any]] = defaultdict(dict) if self.lexical_graph_config: diff --git a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py index 916f9395f..2c8d2a0fd 100644 --- a/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py +++ b/tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from unittest.mock import Mock, patch import neo4j @@ -314,12 +315,13 @@ def test_simple_kg_pipeline_config_run_params_from_text_file_path() -> None: def test_simple_kg_pipeline_config_run_params_no_file_no_text() -> None: config = SimpleKGPipelineConfig(from_pdf=False) - with pytest.raises(PipelineDefinitionError) as excinfo: + with pytest.raises( + PipelineDefinitionError, + match=re.escape( + "At least one of `text` (when from_pdf=False) or `file_path` (when from_pdf=True) argument must be provided." + ), + ): config.get_run_params({}) - assert ( - "Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument." - in str(excinfo) - ) def test_simple_kg_pipeline_config_process_schema_with_precedence_legacy() -> None: