Skip to content

Commit

Permalink
Fixes #2723: apoc.load.xml on large file generates OoM Errors (#2841)
Browse files Browse the repository at this point in the history
  • Loading branch information
vga91 authored and nadja-muller committed May 23, 2022
1 parent 2ac45f4 commit 7a2cbda
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 0 deletions.
20 changes: 20 additions & 0 deletions core/src/test/java/apoc/load/XmlTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.neo4j.graphdb.QueryExecutionException;
import org.neo4j.graphdb.ResourceIterator;
import org.neo4j.internal.helpers.collection.Iterables;
import org.neo4j.internal.helpers.collection.Iterators;
import org.neo4j.test.rule.DbmsRule;
Expand Down Expand Up @@ -56,6 +57,25 @@ public void testLoadXml() {
});
}

/**
 * Runs {@code apoc.load.xml} against {@code databases.xml} with the XPath
 * {@code /parent/child} and checks that the {@code value} column produces
 * exactly the two expected child maps, in document order, and nothing more.
 */
@Test
public void testLoadXmlAsStream() {
    // Expected rows are built up-front so the callback only contains the assertions.
    final Map<String, String> neo4jChild = Map.of("_type", "child", "name", "Neo4j", "_text", "Neo4j is a graph database");
    final List<Map<String, String>> relationalGrandchildren = List.of(
            Map.of("_type", "grandchild", "name", "MySQL", "_text", "MySQL is a database & relational"),
            Map.of("_type", "grandchild", "name", "Postgres", "_text", "Postgres is a relational database")
    );
    final Map<String, Object> relationalChild = Map.of("_type", "child", "name", "relational", "_children", relationalGrandchildren);

    testResult(db, "CALL apoc.load.xml('file:databases.xml', '/parent/child')", // YIELD value RETURN value
            (result) -> {
                final ResourceIterator<Map<String, Object>> rows = result.columnAs("value");
                // first matched <child>
                assertEquals(neo4jChild, rows.next());
                // second matched <child>, carrying its nested grandchildren
                assertEquals(relationalChild, rows.next());
                // no further rows are expected
                assertFalse(rows.hasNext());
            });
}

@Test
public void testMixedContent() {
testCall(db, "CALL apoc.load.xml('" + TestUtil.getUrlFileName("xml/mixedcontent.xml") + "')", // YIELD value RETURN value
Expand Down
112 changes: 112 additions & 0 deletions docs/asciidoc/modules/ROOT/partials/usage/apoc.load.xml.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,118 @@ RETURN author;
| "Ralls, Kim"
|===


=== Avoid OOM using XPath

Generally, when handling large files you should, if possible, return the result as a stream rather than as a single result, in order to avoid `java.lang.OutOfMemoryError: Java heap space`.
For example, with a file like this:

.largeFile.xml
[source,xml]
----
<?xml version="1.0" encoding="UTF-8"?>
<!-- <graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"> -->
<graphml name="databases">
<key id="name" for="node" attr.name="name"/>
<key id="tagline" for="node" attr.name="tagline"/>
<key id="title" for="node" attr.name="title"/>
<key id="labels" for="node" attr.name="labels"/>
<key id="summary" for="edge" attr.name="summary"/>
<key id="label" for="edge" attr.name="label"/>
<graph id="G" edgedefault="directed">
<node id="n0" labels=":Movie"><data key="labels">:Movie</data><data key="title">The Matrix</data><data key="tagline">Welcome to the Real World</data><data key="released">1999</data></node>
<node id="n1" labels=":Person"><data key="labels">:Person</data><data key="born">1964</data><data key="name">Keanu Reeves</data></node>
<node id="n2" labels=":Person"><data key="labels">:Person</data><data key="born">1967</data><data key="name">Carrie-Anne Moss</data></node>
<node id="n3" labels=":Person"><data key="labels">:Person</data><data key="born">1961</data><data key="name">Laurence Fishburne</data></node>
<node id="n4" labels=":Person"><data key="labels">:Person</data><data key="born">1960</data><data key="name">Hugo Weaving</data></node>
<node id="n5" labels=":Person"><data key="labels">:Person</data><data key="born">1967</data><data key="name">Lilly Wachowski</data></node>
<node id="n6" labels=":Person"><data key="labels">:Person</data><data key="born">1965</data><data key="name">Lana Wachowski</data></node>
<!-- a lot of other node tags... -->
<edge id="e17" source="n3" target="n10" label="ACTED_IN"><data key="label">ACTED_IN</data><data key="roles">["Morpheus"]</data></edge>
<edge id="e18" source="n4" target="n10" label="ACTED_IN"><data key="label">ACTED_IN</data><data key="roles">["Agent Smith"]</data></edge>
<!-- a lot of other edge tags... -->
<foo id="id2">foo2</foo>
<foo id="id3">foo3</foo>
<!-- ... -->
</graph>
</graphml>
----

you can extract all the children of the `graph` tag via:

[source,cypher]
----
CALL apoc.load.xml('largeFile.xml', '/graphml/graph/*', {})
YIELD value RETURN value ORDER BY value.id
----

.Results
[options="header"]
|===
| value
| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Morpheus"]","key":"roles"}],"_type":"edge","id":"e17","label":"ACTED_IN","source":"n3","target":"n10"}
| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Agent Smith"]","key":"roles"}],"_type":"edge","id":"e18","label":"ACTED_IN","source":"n4","target":"n10"}
| {"_type":"foo","id":"id2","_text":"foo2"}
| {"_type":"foo","id":"id3","_text":"foo3"}
| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"}
|===


Or, if you want to include only the `node` tags:

[source,cypher]
----
CALL apoc.load.xml('largeFile.xml', '/graphml/graph/node', {})
YIELD value RETURN value ORDER BY value.id
----

.Results
[options="header"]
|===
| value
| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"}
|===

You can also include multiple tag names with `or`, e.g.:

[source,cypher]
----
CALL apoc.load.xml('largeFile.xml', 'graphml/graph/*[self::node or self::edge]', {})
YIELD value RETURN value ORDER BY value.id
----

.Results
[options="header"]
|===
| value
| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Morpheus"]","key":"roles"}],"_type":"edge","id":"e17","label":"ACTED_IN","source":"n3","target":"n10"}
| {"_children":[{"_type":"data","_text":"ACTED_IN","key":"label"},{"_type":"data","_text":"["Agent Smith"]","key":"roles"}],"_type":"edge","id":"e18","label":"ACTED_IN","source":"n4","target":"n10"}
| {"_children":[{"_type":"data","_text":":Movie","key":"labels"},{"_type":"data","_text":"The Matrix","key":"title"},{"_type":"data","_text":"Welcome to the Real World","key":"tagline"},{"_type":"data","_text":"1999","key":"released"}],"_type":"node","id":"n0","labels":":Movie"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1964","key":"born"},{"_type":"data","_text":"Keanu Reeves","key":"name"}],"_type":"node","id":"n1","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Carrie-Anne Moss","key":"name"}],"_type":"node","id":"n2","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1961","key":"born"},{"_type":"data","_text":"Laurence Fishburne","key":"name"}],"_type":"node","id":"n3","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1960","key":"born"},{"_type":"data","_text":"Hugo Weaving","key":"name"}],"_type":"node","id":"n4","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1967","key":"born"},{"_type":"data","_text":"Lilly Wachowski","key":"name"}],"_type":"node","id":"n5","labels":":Person"}
| {"_children":[{"_type":"data","_text":":Person","key":"labels"},{"_type":"data","_text":"1965","key":"born"},{"_type":"data","_text":"Lana Wachowski","key":"name"}],"_type":"node","id":"n6","labels":":Person"}
|===

See the https://docs.oracle.com/javase/7/docs/api/javax/xml/xpath/XPath.html[Java XPath documentation] and the https://www.w3schools.com/xml/xpath_intro.asp[W3Schools XPath tutorial] for more examples and details.

[[load-xml-examples-extracting-datastructures]]
=== Extracting data structures

Expand Down

0 comments on commit 7a2cbda

Please sign in to comment.