diff --git a/core/src/test/java/apoc/load/XmlTest.java b/core/src/test/java/apoc/load/XmlTest.java index 625860ac68..d9db827af9 100644 --- a/core/src/test/java/apoc/load/XmlTest.java +++ b/core/src/test/java/apoc/load/XmlTest.java @@ -11,6 +11,7 @@ import org.junit.Test; import org.junit.rules.ExpectedException; import org.neo4j.graphdb.QueryExecutionException; +import org.neo4j.graphdb.ResourceIterator; import org.neo4j.internal.helpers.collection.Iterables; import org.neo4j.internal.helpers.collection.Iterators; import org.neo4j.test.rule.DbmsRule; @@ -56,6 +57,25 @@ public void testLoadXml() { }); } + @Test + public void testLoadXmlAsStream() { + testResult(db, "CALL apoc.load.xml('file:databases.xml', '/parent/child')", // YIELD value RETURN value + (res) -> { + final ResourceIterator> value = res.columnAs("value"); + final Map expectedFirstRow = Map.of("_type", "child", "name", "Neo4j", "_text", "Neo4j is a graph database"); + final Map expectedSecondRow = Map.of("_type", "child", "name", "relational", "_children", + List.of( + Map.of("_type", "grandchild", "name", "MySQL", "_text", "MySQL is a database & relational"), + Map.of("_type", "grandchild", "name", "Postgres", "_text", "Postgres is a relational database") + )); + Map next = value.next(); + assertEquals(expectedFirstRow, next); + next = value.next(); + assertEquals(expectedSecondRow, next); + assertFalse(value.hasNext()); + }); + } + @Test public void testMixedContent() { testCall(db, "CALL apoc.load.xml('" + TestUtil.getUrlFileName("xml/mixedcontent.xml") + "')", // YIELD value RETURN value diff --git a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.xml.adoc b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.xml.adoc index 3ce15e8933..c68bfb49ca 100644 --- a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.xml.adoc +++ b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.xml.adoc @@ -277,6 +277,118 @@ RETURN author; | "Ralls, Kim" |=== + +=== Avoid OOM using Xpath + +Generally, to avoid 
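Heap Space Errors (`java.lang.OutOfMemoryError: Java heap space`) +when handling large files, you should, whenever possible, return the result as a stream rather than as a single result. +For example, with a file like this: +.largeFile.xml +[source,xml] +---- + +<graphml> + + + + + + + +<graph> + <node id="n0" labels=":Movie"><data key="labels">:Movie</data><data key="title">The Matrix</data><data key="tagline">Welcome to the Real World</data><data key="released">1999</data></node> + <node id="n1" labels=":Person"><data key="labels">:Person</data><data key="born">1964</data><data key="name">Keanu Reeves</data></node> + <node id="n2" labels=":Person"><data key="labels">:Person</data><data key="born">1967</data><data key="name">Carrie-Anne Moss</data></node> + <node id="n3" labels=":Person"><data key="labels">:Person</data><data key="born">1961</data><data key="name">Laurence Fishburne</data></node> + <node id="n4" labels=":Person"><data key="labels">:Person</data><data key="born">1960</data><data key="name">Hugo Weaving</data></node> + <node id="n5" labels=":Person"><data key="labels">:Person</data><data key="born">1967</data><data key="name">Lilly Wachowski</data></node> + <node id="n6" labels=":Person"><data key="labels">:Person</data><data key="born">1965</data><data key="name">Lana Wachowski</data></node> + // a lot of other node tags... + + <edge id="e17" source="n3" target="n10" label="ACTED_IN"><data key="label">ACTED_IN</data><data key="roles">["Morpheus"]</data></edge> + <edge id="e18" source="n4" target="n10" label="ACTED_IN"><data key="label">ACTED_IN</data><data key="roles">["Agent Smith"]</data></edge> + // a lot of other edge tags... + + <foo id="id2">foo2</foo> + <foo id="id3">foo3</foo> + // ... +</graph> +</graphml> + +---- + +you can extract all the children of the `graph` tag via: + +[source,cypher] +---- +CALL apoc.load.xml('largeFile.xml', '/graphml/graph/*', {}) +YIELD value RETURN value ORDER BY value.id +---- + +.Results +[options="header"] +|=== +| value +| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Morpheus"]","key":"roles"}],"_type":"edge","id":"e17","label":"ACTED_IN","source":"n3","target":"n10"} +| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Agent Smith"]","key":"roles"}],"_type":"edge","id":"e18","label":"ACTED_IN","source":"n4","target":"n10"} +| {"_type":"foo","id":"id2","_text":"foo2"} +| {"_type":"foo","id":"id3","_text":"foo3"} +| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"} +| 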
{"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"} +|=== + + +Or if you want to include only `node` tag: + +[source,cypher] +---- +CALL apoc.load.xml('largeFile.xml', '/graphml/graph/node', {}) +YIELD value RETURN value ORDER BY value.id +---- + +.Results +[options="header"] +|=== +| value +| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne 
Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"} +|=== + +You can also include multiple tag names with `or`, e.g.: + +[source,cypher] +---- +CALL apoc.load.xml('largeFile.xml', 'graphml/graph/*[self::node or self::edge]', {}) +YIELD value RETURN value ORDER BY value.id +---- + +.Results +[options="header"] +|=== +| value +| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Morpheus"]","key":"roles"}],"_type":"edge","id":"e17","label":"ACTED_IN","source":"n3","target":"n10"} +| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Agent Smith"]","key":"roles"}],"_type":"edge","id":"e18","label":"ACTED_IN","source":"n4","target":"n10"} +| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"} +| 
{"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"} +| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"} +|=== + +See https://docs.oracle.com/javase/7/docs/api/javax/xml/xpath/XPath.html[Java XPath documentation] and https://www.w3schools.com/xml/xpath_intro.asp[W3Schools tutorial] for more examples and details. + [[load-xml-examples-extracting-datastructures]] === Extracting data structures