From 9db89a4a657193777e55b312407ee5cbbb88907b Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Sun, 30 Sep 2018 17:39:23 +0100
Subject: [PATCH 1/6] wip: Overlap similarity - need to figure out how it
 should work for the topK variant

---
 .../similarity/CategoricalInput.java          |  17 +
 .../graphalgo/similarity/CosineProc.java      |   2 +-
 .../graphalgo/similarity/EuclideanProc.java   |   2 +-
 .../graphalgo/similarity/JaccardProc.java     |   2 +-
 .../graphalgo/similarity/OverlapProc.java     |  70 ++++
 .../graphalgo/similarity/SimilarityProc.java  |   4 +-
 .../algo/similarity/OverlapTest.java          | 337 ++++++++++++++++++
 7 files changed, 429 insertions(+), 5 deletions(-)
 create mode 100644 algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java
 create mode 100644 tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java

diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
index 761f019e9..bc5ec742d 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
@@ -26,4 +26,21 @@ SimilarityResult jaccard(double similarityCutoff, CategoricalInput e2) {
         if (jaccard < similarityCutoff) return null;
         return new SimilarityResult(id, e2.id, count1, count2, intersection, jaccard);
     }
+
+    SimilarityResult overlap(double similarityCutoff, CategoricalInput e2) { // overlap = intersection / min(count1, count2); null when the pair is filtered out; the input with fewer targets is reported as item1
+        long intersection = Intersections.intersection3(targets, e2.targets);
+        if (similarityCutoff >= 0d && intersection == 0) return null; // non-negative cutoff: pairs with an empty intersection are dropped; a negative cutoff keeps them with similarity 0
+        int count1 = targets.length;
+        int count2 = e2.targets.length;
+        long denominator = Math.min(count1, count2); // size of the smaller target array
+        double overlap = denominator == 0 ? 0 : (double)intersection / denominator; // an empty target array yields 0 instead of a division by zero
+        if (overlap < similarityCutoff) return null;
+
+        if(count1 <= count2) {
+            return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap);
+        } else {
+            return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap);
+        }
+
+    }
 }
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java
index 67e11ef3e..1ff223de2 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java
@@ -81,7 +81,7 @@ public Stream<SimilaritySummaryResult> cosine(
 
 
         boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0;
-        return writeAndAggregateResults(configuration, stream, inputs.length, write);
+        return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR");
     }
 
 
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java
index 0a0c0a765..d845af1cf 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java
@@ -80,7 +80,7 @@ public Stream<SimilaritySummaryResult> euclidean(
                 .map(SimilarityResult::squareRooted);
 
         boolean write = configuration.isWriteFlag(false); //  && similarityCutoff != 0.0;
-        return writeAndAggregateResults(configuration, stream, inputs.length, write);
+        return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR");
     }
 
 
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java
index 594d8fb6b..435bff9ba 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java
@@ -61,7 +61,7 @@ public Stream<SimilaritySummaryResult> jaccard(
         Stream<SimilarityResult> stream = topN(similarityStream(inputs, computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration));
 
         boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0;
-        return writeAndAggregateResults(configuration, stream, inputs.length, write);
+        return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR");
     }
 
 
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java
new file mode 100644
index 000000000..ee6075970
--- /dev/null
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java
@@ -0,0 +1,70 @@
+/**
+ * Copyright (c) 2017 "Neo4j, Inc." <http://neo4j.com>
+ *
+ * This file is part of Neo4j Graph Algorithms <http://github.com/neo4j-contrib/neo4j-graph-algorithms>.
+ *
+ * Neo4j Graph Algorithms is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.neo4j.graphalgo.similarity;
+
+import org.neo4j.graphalgo.core.ProcedureConfiguration;
+import org.neo4j.procedure.Description;
+import org.neo4j.procedure.Mode;
+import org.neo4j.procedure.Name;
+import org.neo4j.procedure.Procedure;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Stream;
+
+/** Neo4j procedures that compute the overlap similarity between items described by lists of category ids. */
+public class OverlapProc extends SimilarityProc {
+
+    /** Streams the retained pairs; CategoricalInput#overlap reports the item with fewer categories as item1. */
+    @Procedure(name = "algo.similarity.overlap.stream", mode = Mode.READ)
+    @Description("CALL algo.similarity.overlap.stream([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
+            "YIELD item1, item2, count1, count2, intersection, similarity - computes overlap similarities")
+    public Stream<SimilarityResult> similarityStream(
+            @Name(value = "data", defaultValue = "null") List<Map<String,Object>> data,
+            @Name(value = "config", defaultValue = "{}") Map<String, Object> config) {
+
+        SimilarityComputer<CategoricalInput> computer = (s, t, cutoff) -> s.overlap(cutoff, t);
+        ProcedureConfiguration configuration = ProcedureConfiguration.create(config);
+        CategoricalInput[] inputs = prepareCategories(data, getDegreeCutoff(configuration));
+
+        // The pairwise stream (honouring similarityCutoff and topK) is passed through topN with the configured "top" value.
+        return topN(similarityStream(inputs, computer, configuration, getSimilarityCutoff(configuration), getTopK(configuration)), getTopN(configuration));
+    }
+
+    /** Runs the same computation and returns the summary produced by writeAndAggregateResults, optionally writing relationships. */
+    @Procedure(name = "algo.similarity.overlap", mode = Mode.WRITE)
+    @Description("CALL algo.similarity.overlap([{item:id, categories:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " +
+            "YIELD p50, p75, p90, p99, p999, p100 - computes overlap similarities")
+    public Stream<SimilaritySummaryResult> overlap(
+            @Name(value = "data", defaultValue = "null") List<Map<String, Object>> data,
+            @Name(value = "config", defaultValue = "{}") Map<String, Object> config) {
+
+        SimilarityComputer<CategoricalInput> computer = (s,t,cutoff) -> s.overlap(cutoff, t);
+        ProcedureConfiguration configuration = ProcedureConfiguration.create(config);
+        CategoricalInput[] inputs = prepareCategories(data, getDegreeCutoff(configuration));
+
+        double similarityCutoff = getSimilarityCutoff(configuration);
+        Stream<SimilarityResult> stream = topN(similarityStream(inputs, computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration));
+
+        // Relationships are written only when the write flag is set AND the cutoff is strictly positive.
+        boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0;
+        // NARROWER_THAN is the default relationship type; the "writeRelationshipType" config entry overrides it.
+        return writeAndAggregateResults(configuration, stream, inputs.length, write, "NARROWER_THAN");
+    }
+}
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
index e73c7de8e..9c1126422 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
@@ -66,8 +66,8 @@ Long getDegreeCutoff(ProcedureConfiguration configuration) {
         return configuration.get("degreeCutoff", 0L);
     }
 
-    Stream<SimilaritySummaryResult> writeAndAggregateResults(ProcedureConfiguration configuration, Stream<SimilarityResult> stream, int length, boolean write) {
-        String writeRelationshipType = configuration.get("writeRelationshipType", "SIMILAR");
+    Stream<SimilaritySummaryResult> writeAndAggregateResults(ProcedureConfiguration configuration, Stream<SimilarityResult> stream, int length, boolean write, String defaultWriteProperty) {
+        String writeRelationshipType = configuration.get("writeRelationshipType", defaultWriteProperty);
         String writeProperty = configuration.getWriteProperty("score");
 
         AtomicLong similarityPairs = new AtomicLong();
diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java
new file mode 100644
index 000000000..d2d2b9ebe
--- /dev/null
+++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java
@@ -0,0 +1,337 @@
+/**
+ * Copyright (c) 2017 "Neo4j, Inc." <http://neo4j.com>
+ *
+ * This file is part of Neo4j Graph Algorithms <http://github.com/neo4j-contrib/neo4j-graph-algorithms>.
+ *
+ * Neo4j Graph Algorithms is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.neo4j.graphalgo.algo.similarity;
+
+import org.junit.*;
+import org.neo4j.graphalgo.TestDatabaseCreator;
+import org.neo4j.graphalgo.similarity.OverlapProc;
+import org.neo4j.graphdb.Result;
+import org.neo4j.graphdb.Transaction;
+import org.neo4j.internal.kernel.api.exceptions.KernelException;
+import org.neo4j.kernel.impl.proc.Procedures;
+import org.neo4j.kernel.internal.GraphDatabaseAPI;
+
+import java.util.Map;
+
+import static java.util.Collections.singletonMap;
+import static org.junit.Assert.*;
+import static org.neo4j.helpers.collection.MapUtil.map;
+
+public class OverlapTest {
+
+    private static GraphDatabaseAPI db;
+    private Transaction tx;
+    public static final String STATEMENT_STREAM = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" +
+            "WITH {item:id(p), categories: collect(distinct id(i))} as userData\n" +
+            "WITH collect(userData) as data\n" +
+            "call algo.similarity.overlap.stream(data,$config) " +
+            "yield item1, item2, count1, count2, intersection, similarity " +
+            "RETURN item1, item2, count1, count2, intersection, similarity " +
+            "ORDER BY item1,item2";
+
+    public static final String STATEMENT = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" +
+            "WITH {item:id(p), categories: collect(distinct id(i))} as userData\n" +
+            "WITH collect(userData) as data\n" +
+            "CALL algo.similarity.overlap(data, $config) " +
+            "yield p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs " +
+            "RETURN p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs";
+
+    public static final String STORE_EMBEDDING_STATEMENT = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" +
+            "WITH p, collect(distinct id(i)) as userData\n" +
+            "SET p.embedding = userData";
+
+    public static final String EMBEDDING_STATEMENT = "MATCH (p:Person) \n" +
+            "WITH {item:id(p), categories: p.embedding} as userData\n" +
+            "WITH collect(userData) as data\n" +
+            "CALL algo.similarity.overlap(data, $config) " +
+            "yield p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs " +
+            "RETURN p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs";
+
+    @BeforeClass
+    public static void beforeClass() throws KernelException {
+        db = TestDatabaseCreator.createTestDatabase();
+        db.getDependencyResolver().resolveDependency(Procedures.class).registerProcedure(OverlapProc.class);
+        db.execute(buildDatabaseQuery()).close();
+    }
+
+    @AfterClass
+    public static void AfterClass() {
+        db.shutdown();
+    }
+
+    @Before
+    public void setUp() throws Exception {
+        tx = db.beginTx();
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        tx.close();
+    }
+
+    private static void buildRandomDB(int size) {
+        db.execute("MATCH (n) DETACH DELETE n").close();
+        db.execute("UNWIND range(1,$size/10) as _ CREATE (:Person) CREATE (:Item) ",singletonMap("size",size)).close();
+        String statement =
+                "MATCH (p:Person) WITH collect(p) as people " +
+                "MATCH (i:Item) WITH people, collect(i) as items " +
+                "UNWIND range(1,$size) as _ " +
+                "WITH people[toInteger(rand()*size(people))] as p, items[toInteger(rand()*size(items))] as i " +
+                "MERGE (p)-[:LIKES]->(i) RETURN count(*) ";
+        db.execute(statement,singletonMap("size",size)).close();
+    }
+
+    private static String buildDatabaseQuery() { // Cypher that creates the fixture: four people, three items and the LIKES relationships between them
+        return  "CREATE (a:Person {name:'Alice'})\n" +
+                "CREATE (b:Person {name:'Bob'})\n" +
+                "CREATE (c:Person {name:'Charlie'})\n" +
+                "CREATE (d:Person {name:'Dana'})\n" +
+                "CREATE (i1:Item {name:'p1'})\n" +
+                "CREATE (i2:Item {name:'p2'})\n" +
+                "CREATE (i3:Item {name:'p3'})\n" +
+
+                "CREATE" +
+                " (a)-[:LIKES]->(i1),\n" +
+                " (a)-[:LIKES]->(i2),\n" +
+                " (a)-[:LIKES]->(i3),\n" +
+                " (b)-[:LIKES]->(i1),\n" +
+                " (b)-[:LIKES]->(i2),\n" +
+                " (c)-[:LIKES]->(i3)\n";
+        // a: 3 items
+        // b: 2 items
+        // c: 1 item (d likes nothing)
+        // overlap a / b: intersection 2, min(3, 2) = 2 -> 2/2 = 1
+        // overlap a / c: intersection 1, min(3, 1) = 1 -> 1/1 = 1
+        // overlap b / c: intersection 0, min(2, 1) = 1 -> 0/1 = 0
+    }
+
+
+    @Test
+    public void overlapSingleMultiThreadComparision() {
+        int size = 333;
+        buildRandomDB(size);
+        Result result1 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 1)));
+        Result result2 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 2)));
+        Result result4 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 4)));
+        Result result8 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 8)));
+        int count=0;
+        while (result1.hasNext()) {
+            Map<String, Object> row1 = result1.next();
+            assertEquals(row1.toString(), row1,result2.next());
+            assertEquals(row1.toString(), row1,result4.next());
+            assertEquals(row1.toString(), row1,result8.next());
+            count++;
+        }
+        int people = size/10;
+        assertEquals((people * people - people)/2,count);
+    }
+
+    @Test
+    public void overlapSingleMultiThreadComparisionTopK() {
+        int size = 333;
+        buildRandomDB(size);
+
+        Result result1 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 1)));
+        Result result2 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 2)));
+        Result result4 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 4)));
+        Result result8 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 8)));
+        int count=0;
+        while (result1.hasNext()) {
+            Map<String, Object> row1 = result1.next();
+            assertEquals(row1.toString(), row1,result2.next());
+            assertEquals(row1.toString(), row1,result4.next());
+            assertEquals(row1.toString(), row1,result8.next());
+            count++;
+        }
+        int people = size/10;
+        assertEquals(people,count);
+    }
+
+    @Test
+    public void topNoverlapStreamTest() {
+        Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2)));
+        assert01(results.next());
+        assert02(results.next());
+        assertFalse(results.hasNext());
+    }
+
+    @Test
+    public void overlapStreamTest() {
+        Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1)));
+
+        assertTrue(results.hasNext());
+        assert01(results.next());
+        assert02(results.next());
+        assert12(results.next());
+        assertFalse(results.hasNext());
+    }
+
+    @Test
+    public void topKoverlapStreamTest() {
+        Map<String, Object> params = map("config", map( "concurrency", 1,"topK", 1));
+        System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());
+
+        Result results = db.execute(STATEMENT_STREAM, params);
+        assertTrue(results.hasNext());
+        assert01(results.next());
+        assert01(flip(results.next()));
+        assert02(flip(results.next()));
+        assertFalse(results.hasNext());
+    }
+
+    private Map<String, Object> flip(Map<String, Object> row) {
+        return map("similarity", row.get("similarity"),"intersection", row.get("intersection"),
+                "item1",row.get("item2"),"count1",row.get("count2"),
+                "item2",row.get("item1"),"count2",row.get("count1"));
+    }
+
+    private void assertSameSource(Result results, int count, long source) {
+        Map<String, Object> row;
+        long target = 0;
+        for (int i = 0; i<count; i++) {
+            if (target == source) target++;
+            assertTrue(results.hasNext());
+            row = results.next();
+            assertEquals(source, row.get("item1"));
+            assertEquals(target, row.get("item2"));
+            target++;
+        }
+    }
+
+
+    @Test
+    public void topK4overlapStreamTest() {
+        Map<String, Object> params = map("config", map("topK", 4, "concurrency", 4, "similarityCutoff", -0.1));
+        System.out.println(db.execute(STATEMENT_STREAM,params).resultAsString());
+
+        Result results = db.execute(STATEMENT_STREAM,params);
+        assertSameSource(results, 2, 0L);
+        assertSameSource(results, 2, 1L);
+        assertSameSource(results, 2, 2L);
+        assertFalse(results.hasNext());
+    }
+
+    @Test
+    public void topK3overlapStreamTest() {
+        Map<String, Object> params = map("config", map("concurrency", 3, "topK", 3));
+
+        System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());
+
+        Result results = db.execute(STATEMENT_STREAM, params);
+        assertSameSource(results, 2, 0L);
+        assertSameSource(results, 2, 1L);
+        assertSameSource(results, 2, 2L);
+        assertFalse(results.hasNext());
+    }
+
+    @Test
+    public void simpleoverlapTest() {
+        Map<String, Object> params = map("config", map("similarityCutoff", 0.0));
+        // On the fixture graph a cutoff of 0.0 keeps only the pairs that share items (Bob/Alice, Charlie/Alice), both scoring 1.0.
+        Map<String, Object> row = db.execute(STATEMENT,params).next();
+        assertEquals(1.0, (double) row.get("p25"), 0.01); // JUnit order is (expected, actual, delta)
+        assertEquals(1.0, (double) row.get("p50"), 0.01);
+        assertEquals(1.0, (double) row.get("p75"), 0.01);
+        assertEquals(1.0, (double) row.get("p95"), 0.01);
+        assertEquals(1.0, (double) row.get("p99"), 0.01);
+        assertEquals(1.0, (double) row.get("p100"), 0.01);
+    }
+
+    @Test
+    public void simpleoverlapFromEmbeddingTest() {
+        db.execute(STORE_EMBEDDING_STATEMENT).close(); // close the Result of the SET statement instead of leaving it open
+
+        Map<String, Object> params = map("config", map("similarityCutoff", 0.0));
+
+        Map<String, Object> row = db.execute(EMBEDDING_STATEMENT,params).next();
+        // Same expectations as simpleoverlapTest; here the category lists come from the stored p.embedding property.
+        assertEquals(1.0, (double) row.get("p25"), 0.01); // JUnit order is (expected, actual, delta)
+        assertEquals(1.0, (double) row.get("p50"), 0.01);
+        assertEquals(1.0, (double) row.get("p75"), 0.01);
+        assertEquals(1.0, (double) row.get("p95"), 0.01);
+        assertEquals(1.0, (double) row.get("p99"), 0.01);
+        assertEquals(1.0, (double) row.get("p100"), 0.01);
+    }
+
+    /*
+    Alice       [p1,p2,p3]
+    Bob         [p1,p2]
+    Charlie     [p3]
+    Dana        []
+     */
+
+    @Test
+    public void simpleoverlapWriteTest() {
+        Map<String, Object> params = map("config", map( "write",true, "similarityCutoff", 0.1));
+
+        db.execute(STATEMENT,params).close();
+
+        String checkSimilaritiesQuery = "MATCH (a)-[similar:NARROWER_THAN]->(b)" +
+                "RETURN a.name AS node1, b.name as node2, similar.score AS score " +
+                "ORDER BY id(a), id(b)";
+
+        // Expected relationships run from the smaller set to the larger one: Bob->Alice and Charlie->Alice, both with score 1.0.
+        Result result = db.execute(checkSimilaritiesQuery);
+
+        assertTrue(result.hasNext());
+        Map<String, Object> row = result.next();
+        assertEquals("Bob", row.get("node1")); // JUnit order is (expected, actual)
+        assertEquals("Alice", row.get("node2"));
+        assertEquals(1.0, (double) row.get("score"), 0.01);
+
+        assertTrue(result.hasNext());
+        row = result.next();
+        assertEquals("Charlie", row.get("node1"));
+        assertEquals("Alice", row.get("node2"));
+        assertEquals(1.0, (double) row.get("score"), 0.01);
+
+        assertFalse(result.hasNext());
+    }
+
+    private void assert12(Map<String, Object> row) {
+        assertEquals(2L, row.get("item1"));
+        assertEquals(1L, row.get("item2"));
+        assertEquals(1L, row.get("count1"));
+        assertEquals(2L, row.get("count2"));
+        // assertEquals(0L, row.get("intersection"));
+        assertEquals(0d, row.get("similarity"));
+    }
+
+    // a / b = 2 : 2/3
+    // a / c = 1 : 1/3
+    // b / c = 0 : 0/3 = 0
+
+    private void assert02(Map<String, Object> row) {
+        assertEquals(2L, row.get("item1"));
+        assertEquals(0L, row.get("item2"));
+        assertEquals(1L, row.get("count1"));
+        assertEquals(3L, row.get("count2"));
+        // assertEquals(1L, row.get("intersection"));
+        assertEquals(1d/1d, row.get("similarity"));
+    }
+
+    private void assert01(Map<String, Object> row) {
+        assertEquals(1L, row.get("item1"));
+        assertEquals(0L, row.get("item2"));
+        assertEquals(2L, row.get("count1"));
+        assertEquals(3L, row.get("count2"));
+        // assertEquals(2L, row.get("intersection"));
+        assertEquals(2d/2d, row.get("similarity"));
+    }
+}

From 075f0bd2b82c247b737a8aa2b72fa836607a817b Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Mon, 1 Oct 2018 11:36:15 +0100
Subject: [PATCH 2/6] handle returning the smaller set first for topK

---
 .../similarity/CategoricalInput.java          |  4 +-
 .../graphalgo/similarity/SimilarityProc.java  | 60 +++++++++++--------
 .../similarity/SimilarityResult.java          | 11 +++-
 .../algo/similarity/OverlapTest.java          | 16 ++---
 4 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
index bc5ec742d..f0bddf8c0 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java
@@ -37,9 +37,9 @@ SimilarityResult overlap(double similarityCutoff, CategoricalInput e2) {
         if (overlap < similarityCutoff) return null;
 
         if(count1 <= count2) {
-            return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap);
+            return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap, false, false);
         } else {
-            return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap);
+            return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap, false, true);
         }
 
     }
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
index 9c1126422..237c18643 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java
@@ -51,7 +51,7 @@ static Stream<SimilarityResult> topN(Stream<SimilarityResult> stream, int topN)
         if (topN > 10000) {
             return stream.sorted(comparator).limit(topN);
         }
-        return topK(stream,topN, comparator);
+        return topK(stream, topN, comparator);
     }
 
     private static <T> void put(BlockingQueue<T> queue, T items) {
@@ -77,7 +77,7 @@ Stream<SimilaritySummaryResult> writeAndAggregateResults(ProcedureConfiguration
             similarityPairs.getAndIncrement();
         };
 
-        if(write) {
+        if (write) {
             SimilarityExporter similarityExporter = new SimilarityExporter(api, writeRelationshipType, writeProperty);
             similarityExporter.export(stream.peek(recorder));
         } else {
@@ -114,17 +114,15 @@ <T> Stream<SimilarityResult> similarityStream(T[] inputs, SimilarityComputer<T>
     private <T> Stream<SimilarityResult> similarityStream(T[] inputs, int length, double similiarityCutoff, SimilarityComputer<T> computer) {
         return IntStream.range(0, length)
                 .boxed().flatMap(sourceId -> IntStream.range(sourceId + 1, length)
-                        .mapToObj(targetId -> computer.similarity(inputs[sourceId],inputs[targetId],similiarityCutoff)).filter(Objects::nonNull));
+                        .mapToObj(targetId -> computer.similarity(inputs[sourceId], inputs[targetId], similiarityCutoff)).filter(Objects::nonNull));
     }
 
     private <T> Stream<SimilarityResult> similarityStreamTopK(T[] inputs, int length, double cutoff, int topK, SimilarityComputer<T> computer) {
         TopKConsumer<SimilarityResult>[] topKHolder = initializeTopKConsumers(length, topK);
 
-        for (int sourceId = 0;sourceId < length;sourceId++) {
-            computeSimilarityForSourceIndex(sourceId, inputs, length, cutoff, (sourceIndex, targetIndex, similarityResult) -> {
-                topKHolder[sourceIndex].accept(similarityResult);
-                topKHolder[targetIndex].accept(similarityResult.reverse());
-            }, computer);
+        SimilarityConsumer consumer = assignSimilarityPairs(topKHolder);
+        for (int sourceId = 0; sourceId < length; sourceId++) {
+            computeSimilarityForSourceIndex(sourceId, inputs, length, cutoff, consumer, computer);
         }
         return Arrays.stream(topKHolder).flatMap(TopKConsumer::stream);
     }
@@ -176,13 +174,13 @@ private <T> Stream<SimilarityResult> similarityParallelStreamTopK(T[] inputs, in
         ParallelUtil.runWithConcurrency(concurrency, tasks, terminationFlag, Pools.DEFAULT);
 
         TopKConsumer<SimilarityResult>[] topKConsumers = initializeTopKConsumers(length, topK);
-        for (Runnable task : tasks) ((TopKTask)task).mergeInto(topKConsumers);
+        for (Runnable task : tasks) ((TopKTask) task).mergeInto(topKConsumers);
         return Arrays.stream(topKConsumers).flatMap(TopKConsumer::stream);
     }
 
     private <T> void computeSimilarityForSourceIndex(int sourceId, T[] inputs, int length, double cutoff, SimilarityConsumer consumer, SimilarityComputer<T> computer) {
-        for (int targetId=sourceId+1;targetId<length;targetId++) {
-            SimilarityResult similarity = computer.similarity(inputs[sourceId], inputs[targetId],cutoff);
+        for (int targetId = sourceId + 1; targetId < length; targetId++) {
+            SimilarityResult similarity = computer.similarity(inputs[sourceId], inputs[targetId], cutoff);
             if (similarity != null) {
                 consumer.accept(sourceId, targetId, similarity);
             }
@@ -195,11 +193,11 @@ CategoricalInput[] prepareCategories(List<Map<String, Object>> data, long degree
         for (Map<String, Object> row : data) {
             List<Number> targetIds = extractValues(row.get("categories"));
             int size = targetIds.size();
-            if ( size > degreeCutoff) {
+            if (size > degreeCutoff) {
                 long[] targets = new long[size];
-                int i=0;
+                int i = 0;
                 for (Number id : targetIds) {
-                    targets[i++]=id.longValue();
+                    targets[i++] = id.longValue();
                 }
                 Arrays.sort(targets);
                 ids[idx++] = new CategoricalInput((Long) row.get("item"), targets);
@@ -218,11 +216,11 @@ WeightedInput[] prepareWeights(List<Map<String, Object>> data, long degreeCutoff
             List<Number> weightList = extractValues(row.get("weights"));
 
             int size = weightList.size();
-            if ( size > degreeCutoff) {
+            if (size > degreeCutoff) {
                 double[] weights = new double[size];
-                int i=0;
+                int i = 0;
                 for (Number value : weightList) {
-                    weights[i++]=value.doubleValue();
+                    weights[i++] = value.doubleValue();
                 }
                 inputs[idx++] = new WeightedInput((Long) row.get("item"), weights);
             }
@@ -233,7 +231,7 @@ WeightedInput[] prepareWeights(List<Map<String, Object>> data, long degreeCutoff
     }
 
     private List<Number> extractValues(Object rawValues) {
-        if(rawValues == null) {
+        if (rawValues == null) {
             return Collections.emptyList();
         }
 
@@ -259,13 +257,24 @@ protected int getTopK(ProcedureConfiguration configuration) {
     }
 
     protected int getTopN(ProcedureConfiguration configuration) {
-        return configuration.getInt("top",0);
+        return configuration.getInt("top", 0);
     }
 
     interface SimilarityComputer<T> {
         SimilarityResult similarity(T source, T target, double cutoff);
     }
 
+    public static SimilarityConsumer assignSimilarityPairs(TopKConsumer<SimilarityResult>[] topKConsumers) {
+        // Builds the consumer shared by the topK code paths: each result goes to the top-K holder at index t when it is flagged reversed, otherwise at index s.
+        return (s, t, result) -> {
+            topKConsumers[result.reversed ? t : s].accept(result);
+            if (result.bidirectional) { // only bidirectional results are additionally offered in their reverse() form
+                SimilarityResult reverse = result.reverse();
+                topKConsumers[reverse.reversed ? t : s].accept(reverse); // NOTE(review): the target index relies on the flag reverse() sets, which is not visible here - confirm
+            }
+        };
+    }
+
     private class TopKTask<T> implements Runnable {
         private final int batchSize;
         private final int taskOffset;
@@ -273,10 +282,10 @@ private class TopKTask<T> implements Runnable {
         private final int length;
         private final T[] ids;
         private final double similiarityCutoff;
-        private final SimilarityComputer computer;
+        private final SimilarityComputer<T> computer;
         private final TopKConsumer<SimilarityResult>[] topKConsumers;
 
-        TopKTask(int batchSize, int taskOffset, int multiplier, int length, T[] ids, double similiarityCutoff, int topK, SimilarityComputer computer) {
+        TopKTask(int batchSize, int taskOffset, int multiplier, int length, T[] ids, double similiarityCutoff, int topK, SimilarityComputer<T> computer) {
             this.batchSize = batchSize;
             this.taskOffset = taskOffset;
             this.multiplier = multiplier;
@@ -289,16 +298,17 @@ private class TopKTask<T> implements Runnable {
 
         @Override
         public void run() {
+            SimilarityConsumer consumer = assignSimilarityPairs(topKConsumers);
             for (int offset = 0; offset < batchSize; offset++) {
                 int sourceId = taskOffset * multiplier + offset;
                 if (sourceId < length) {
-                    computeSimilarityForSourceIndex(sourceId, ids, length, similiarityCutoff, (s, t, result) -> {
-                        topKConsumers[s].accept(result);
-                        topKConsumers[t].accept(result.reverse());
-                    }, computer);
+
+                    computeSimilarityForSourceIndex(sourceId, ids, length, similiarityCutoff, consumer, computer);
                 }
             }
         }
+
+
         void mergeInto(TopKConsumer<SimilarityResult>[] target) {
             for (int i = 0; i < target.length; i++) {
                 target[i].accept(topKConsumers[i]);
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java
index 02d9441df..ead39a726 100644
--- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java
+++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java
@@ -30,16 +30,23 @@ public class SimilarityResult implements Comparable<SimilarityResult> {
     public final long count2;
     public final long intersection;
     public double similarity;
+    public final boolean bidirectional;
+    public final boolean reversed;
 
     public static SimilarityResult TOMB = new SimilarityResult(-1, -1, -1, -1, -1, -1);
 
-    public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity) {
+    public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity, boolean bidirectional, boolean reversed) {
         this.item1 = item1;
         this.item2 = item2;
         this.count1 = count1;
         this.count2 = count2;
         this.intersection = intersection;
         this.similarity = similarity;
+        this.bidirectional = bidirectional;
+        this.reversed = reversed;
+    }
+    public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity) {
+        this(item1,item2, count1,count2,intersection,similarity, true, false);
     }
 
     @Override
@@ -70,7 +77,7 @@ public int compareTo(SimilarityResult o) {
     }
 
     public SimilarityResult reverse() {
-        return new SimilarityResult(item2, item1,count2,count1,intersection,similarity);
+        return new SimilarityResult(item2, item1,count2,count1,intersection,similarity,bidirectional,!reversed);
     }
 
     public SimilarityResult squareRooted() {
diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java
index d2d2b9ebe..ead065d98 100644
--- a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java
+++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java
@@ -159,8 +159,9 @@ public void overlapSingleMultiThreadComparisionTopK() {
             assertEquals(row1.toString(), row1,result8.next());
             count++;
         }
-        int people = size/10;
-        assertEquals(people,count);
+        assertFalse(result2.hasNext());
+        assertFalse(result4.hasNext());
+        assertFalse(result8.hasNext());
     }
 
     @Test
@@ -190,8 +191,7 @@ public void topKoverlapStreamTest() {
         Result results = db.execute(STATEMENT_STREAM, params);
         assertTrue(results.hasNext());
         assert01(results.next());
-        assert01(flip(results.next()));
-        assert02(flip(results.next()));
+        assert02(results.next());
         assertFalse(results.hasNext());
     }
 
@@ -221,8 +221,8 @@ public void topK4overlapStreamTest() {
         System.out.println(db.execute(STATEMENT_STREAM,params).resultAsString());
 
         Result results = db.execute(STATEMENT_STREAM,params);
-        assertSameSource(results, 2, 0L);
-        assertSameSource(results, 2, 1L);
+        assertSameSource(results, 0, 0L);
+        assertSameSource(results, 1, 1L);
         assertSameSource(results, 2, 2L);
         assertFalse(results.hasNext());
     }
@@ -234,8 +234,8 @@ public void topK3overlapStreamTest() {
         System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString());
 
         Result results = db.execute(STATEMENT_STREAM, params);
-        assertSameSource(results, 2, 0L);
-        assertSameSource(results, 2, 1L);
+        assertSameSource(results, 0, 0L);
+        assertSameSource(results, 1, 1L);
         assertSameSource(results, 2, 2L);
         assertFalse(results.hasNext());
     }

From 899c44d8acf5f43d2caa5c0662635358d051311b Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Mon, 1 Oct 2018 12:53:20 +0100
Subject: [PATCH 3/6] overlap docs

---
 .../scripts/similarity-overlap.cypher         |  76 ++++++
 doc/asciidoc/similarity-overlap.adoc          | 248 ++++++++++++++++++
 2 files changed, 324 insertions(+)
 create mode 100644 doc/asciidoc/scripts/similarity-overlap.cypher
 create mode 100644 doc/asciidoc/similarity-overlap.adoc

diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher
new file mode 100644
index 000000000..1d1bd5546
--- /dev/null
+++ b/doc/asciidoc/scripts/similarity-overlap.cypher
@@ -0,0 +1,76 @@
+// tag::function[]
+RETURN algo.similarity.overlap([1,2,3], [1,2,4,5]) AS similarity
+// end::function[]
+
+// tag::create-sample-graph[]
+
+MERGE (fahrenheit451:Book {title:'Fahrenheit 451'})
+MERGE (dune:Book {title:'dune'})
+MERGE (hungerGames:Book {title:'The Hunger Games'})
+MERGE (nineteen84:Book {title:'1984'})
+
+MERGE (scienceFiction:Genre {name: "Science Fiction"})
+MERGE (fantasy:Genre {name: "Fantasy"})
+MERGE (dystopia:Genre {name: "Dystopia"})
+
+MERGE (fahrenheit451)-[:HAS_GENRE]->(dystopia)
+MERGE (fahrenheit451)-[:HAS_GENRE]->(scienceFiction)
+MERGE (fahrenheit451)-[:HAS_GENRE]->(fantasy)
+
+MERGE (hungerGames)-[:HAS_GENRE]->(scienceFiction)
+MERGE (hungerGames)-[:HAS_GENRE]->(fantasy)
+MERGE (hungerGames)-[:HAS_GENRE]->(romance)
+
+MERGE (nineteen84)-[:HAS_GENRE]->(scienceFiction)
+MERGE (nineteen84)-[:HAS_GENRE]->(dystopia)
+
+MERGE (dune)-[:HAS_GENRE]->(scienceFiction)
+MERGE (dune)-[:HAS_GENRE]->(fantasy)
+
+// end::create-sample-graph[]
+
+// tag::stream[]
+MATCH (p:Person)-[:LIKES]->(cuisine)
+WITH {item:id(p), categories: collect(id(cuisine))} as userData
+WITH collect(userData) as data
+CALL algo.similarity.jaccard.stream(data)
+YIELD item1, item2, count1, count2, intersection, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity
+ORDER BY similarity DESC
+// end::stream[]
+
+// tag::stream-similarity-cutoff[]
+MATCH (p:Person)-[:LIKES]->(cuisine)
+WITH {item:id(p), categories: collect(id(cuisine))} as userData
+WITH collect(userData) as data
+CALL algo.similarity.jaccard.stream(data, {similarityCutoff: 0.0})
+YIELD item1, item2, count1, count2, intersection, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity
+ORDER BY similarity DESC
+// end::stream-similarity-cutoff[]
+
+// tag::stream-topk[]
+MATCH (p:Person)-[:LIKES]->(cuisine)
+WITH {item:id(p), categories: collect(id(cuisine))} as userData
+WITH collect(userData) as data
+CALL algo.similarity.jaccard.stream(data, {topK: 1, similarityCutoff: 0.0})
+YIELD item1, item2, count1, count2, intersection, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity
+ORDER BY from
+// end::stream-topk[]
+
+// tag::write-back[]
+MATCH (p:Person)-[:LIKES]->(cuisine)
+WITH {item:id(p), categories: collect(id(cuisine))} as userData
+WITH collect(userData) as data
+CALL algo.similarity.jaccard(data, {topK: 1, similarityCutoff: 0.1, write:true})
+YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
+RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95
+// end::write-back[]
+
+// tag::query[]
+MATCH (p:Person {name: "Praveena"})-[:SIMILAR]->(other),
+      (other)-[:LIKES]->(cuisine)
+WHERE not((p)-[:LIKES]->(cuisine))
+RETURN cuisine.name AS cuisine
+// end::query[]
diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc
new file mode 100644
index 000000000..187f80841
--- /dev/null
+++ b/doc/asciidoc/similarity-overlap.adoc
@@ -0,0 +1,248 @@
+[[algorithms-similarity-overlap]]
+= The Overlap Similarity algorithm
+
+[abstract]
+--
+This section describes the Overlap Similarity algorithm in the Neo4j Graph Algorithms library.
+--
+
+// tag::introduction[]
+link:https://en.wikipedia.org/wiki/Overlap_coefficient[Overlap similarity] measures overlap between two sets.
+It is defined as the size of the intersection of two sets divided by the size of the smaller of the two sets.
+// end::introduction[]
+
+
+[[algorithms-similarity-overlap-context]]
+== History and explanation
+
+// tag::explanation[]
+
+Overlap similarity is computed using the following formula:
+
+image::jaccard.png[role="middle"]
+
+// This is the raw information for this image:
+// ```
+// O(A,B) = ∣A ∩ B∣ / min(∣A|,|B|)
+// ```
+
+The library contains both procedures and functions to calculate similarity between sets of data.
+The function is best used when calculating the similarity between small numbers of sets.
+The procedures parallelize the computation and are therefore a better bet when computing similarities on bigger datasets.
+
+// end::explanation[]
+
+[[algorithms-similarity-overlap-usecase]]
+== Use-cases - when to use the Overlap Similarity algorithm
+
+// tag::use-case[]
+We can use the Overlap Similarity algorithm to work out the similarity between two things.
+We might then use the computed similarity as part of a recommendation query.
+For example, you can use the Overlap Similarity algorithm to show the products that were purchased by similar customers, in terms of previous products purchased.
+// end::use-case[]
+
+
+[[algorithms-similarity-overlap-sample]]
+== Overlap Similarity algorithm sample
+
+.The following will return the Overlap similarity of two lists of numbers:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=function]
+----
+
+// tag::function[]
+.Results
+[opts="header",cols="1"]
+|===
+| `similarity`
+| 0.66
+|===
+// end::function[]
+
+// tag::function-explanation[]
+These two lists of numbers have an Overlap similarity of 0.66.
+We can see how this result is derived by breaking down the formula:
+
+```
+O(A,B) = ∣A ∩ B∣ / min(∣A|,|B|)
+O(A,B) = 2 / min(3,4)
+       = 2 / 3
+       = 0.66
+```
+// end::function-explanation[]
+
+.The following will create a sample graph:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=create-sample-graph]
+----
+
+.The following will return a stream of node pairs along with their intersection and Overlap similarities:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=stream]
+----
+
+// tag::stream[]
+.Results
+[opts="header",cols="1,1,1,1"]
+|===
+| From     | To       | Intersection | Similarity
+| Arya     | Karin    | 2            | 0.66
+| Zhen     | Michael  | 2            | 0.66
+| Zhen     | Praveena | 1            | 0.33
+| Michael  | Karin    | 1            | 0.25
+| Praveena | Michael  | 1            | 0.25
+| Praveena | Arya     | 1            | 0.25
+| Michael  | Arya     | 1            | 0.2
+| Praveena | Karin    | 0            | 0
+| Zhen     | Arya     | 0            | 0
+| Zhen     | Karin    | 0            | 0
+|===
+// end::stream[]
+
+Arya and Karin, and Zhen and Michael have the most similar food preferences, with two overlapping cuisines for a similarity of 0.66.
+We also have 3 pairs of users who are not similar at all.
+We'd probably want to filter those out, which we can do by passing in the `similarityCutoff` parameter.
+
+.The following will return a stream of node pairs that have a similarity of at least 0.1, along with their intersection and Overlap similarities:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff]
+----
+
+// tag::stream-similarity-cutoff[]
+.Results
+[opts="header",cols="1,1,1,1"]
+|===
+| `from`   | `to`     | `intersection` | `similarity`
+| Arya     | Karin    | 2              | 0.66
+| Zhen     | Michael  | 2              | 0.66
+| Zhen     | Praveena | 1              | 0.33
+| Michael  | Karin    | 1              | 0.25
+| Praveena | Michael  | 1              | 0.25
+| Praveena | Arya     | 1              | 0.25
+| Michael  | Arya     | 1              | 0.2
+|===
+// end::stream-similarity-cutoff[]
+
+We can see that those users with no similarity have been filtered out.
+If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` users for a given user.
+We can do that by passing in the `topK` parameter.
+
+.The following will return a stream of users along with the most similar user to them (i.e. `k=1`):
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=stream-topk]
+----
+
+// tag::stream-topk[]
+.Results
+[opts="header",cols="1,1,1"]
+|===
+| `from`   | `to`     | `similarity`
+| Arya     | Karin    | 0.66
+| Karin    | Arya     | 0.66
+| Michael  | Zhen     | 0.66
+| Praveena | Zhen     | 0.33
+| Zhen     | Michael  | 0.66
+|===
+// end::stream-topk[]
+
+These results will not be symmetrical.
+For example, the person most similar to Praveena is Zhen, but the person most similar to Zhen is actually Michael.
+
+.Parameters
+[opts="header",cols="1,1,1,1,4"]
+|===
+| Name               | Type   | Default        | Optional | Description
+| `data`             | list   | null           | no       | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
+| `top`              | int    | 0              | yes      | The number of similar pairs to return. If `0`, it will return as many as it finds.
+| `topK`             | int    | 0              | yes      | The number of similar values to return per node. If `0`, it will return as many as it finds.
+| `similarityCutoff` | double | -1             | yes      | The threshold for Overlap similarity. Values below this will not be returned.
+| `degreeCutoff`     | int    | 0              | yes      | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation.
+| `concurrency`      | int    | available CPUs | yes      | The number of concurrent threads.
+|===
+
+.Results
+[opts="header",cols="1,1,6"]
+|===
+| Name           | Type | Description
+| `item1`        | int  | The ID of one node in the similarity pair.
+| `item2`        | int  | The ID of the other node in the similarity pair.
+| `count1`       | int  | The size of the `targets` list of one node.
+| `count2`       | int  | The size of the `targets` list of the other node.
+| `intersection` | int  | The number of intersecting values in the two nodes' `targets` lists.
+| `similarity`   | double | The Overlap similarity of the two nodes.
+|===
+
+.The following will find the most similar user for each user, and store a relationship between those users:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=write-back]
+----
+
+// tag::write-back[]
+.Results
+[opts="header",cols="1,1,1,1,1,1,1,1,1"]
+|===
+| `nodes` | `similarityPairs` | `write` | `writeRelationshipType` | `writeProperty` | `min`  | `max`  | `mean` | `p95`
+| 5       | 5                 | true    | SIMILAR                 | score           | 0.33   | 0.66   | 0.59   | 0.66
+|===
+// end::write-back[]
+
+We then could write a query to find out what types of cuisine that other people similar to us might like.
+
+.The following will find the most similar user to Praveena, and return their favorite cuisines that Praveena doesn't (yet!) like:
+[source, cypher]
+----
+include::scripts/similarity-overlap.cypher[tag=query]
+----
+
+// tag::query[]
+.Results
+[opts="header",cols="1"]
+|===
+| `cuisine`
+| French
+|===
+// end::query[]
+
+.Parameters
+[opts="header",cols="1,1,1,1,4"]
+|===
+| Name                     | Type    | Default        | Optional | Description
+| `data`                   | list    | null           | no       | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}`
+| `top`                    | int     | 0              | yes      | The number of similar pairs to return. If `0`, it will return as many as it finds.
+| `topK`                   | int     | 0              | yes      | The number of similar values to return per node. If `0`, it will return as many as it finds.
+| `similarityCutoff`       | double  | -1             | yes      | The threshold for Overlap similarity. Values below this will not be returned.
+| `degreeCutoff`           | int     | 0              | yes      | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation.
+| `concurrency`            | int     | available CPUs | yes      | The number of concurrent threads.
+| `write`                  | boolean | false          | yes      | Indicates whether results should be stored.
+| `writeRelationshipType`  | string  | SIMILAR        | yes      | The relationship type to use when storing results.
+| `writeProperty`          | string  | score          | yes      | The property to use when storing results.
+|===
+
+.Results
+[opts="header",cols="1,1,6"]
+|===
+| Name                    | Type    | Description
+| `nodes`                 | int     | The number of nodes passed in.
+| `similarityPairs`       | int     | The number of pairs of similar nodes computed.
+| `write`                 | boolean | Indicates whether results were stored.
+| `writeRelationshipType` | string  | The relationship type used when storing results.
+| `writeProperty`         | string  | The property used when storing results.
+| `min`                   | double  | The minimum similarity score computed.
+| `max`                   | double  | The maximum similarity score computed.
+| `mean`                  | double  | The mean of similarities scores computed.
+| `stdDev`                | double  | The standard deviation of similarities scores computed.
+| `p25`                   | double  | The 25 percentile of similarities scores computed.
+| `p50`                   | double  | The 50 percentile of similarities scores computed.
+| `p75`                   | double  | The 75 percentile of similarities scores computed.
+| `p90`                   | double  | The 90 percentile of similarities scores computed.
+| `p95`                   | double  | The 95 percentile of similarities scores computed.
+| `p99`                   | double  | The 99 percentile of similarities scores computed.
+| `p999`                  | double  | The 99.9 percentile of similarities scores computed.
+| `p100`                  | double  | The 100 percentile of similarities scores computed.
+|===

From 328eff5ce117ef686628d7ab8f62141a4d17a079 Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Mon, 1 Oct 2018 13:33:22 +0100
Subject: [PATCH 4/6] more examples

---
 .../scripts/similarity-overlap.cypher         | 33 +++++++-----
 doc/asciidoc/similarity-overlap.adoc          | 51 ++++++++-----------
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher
index 1d1bd5546..ef7ad4842 100644
--- a/doc/asciidoc/scripts/similarity-overlap.cypher
+++ b/doc/asciidoc/scripts/similarity-overlap.cypher
@@ -5,17 +5,20 @@ RETURN algo.similarity.overlap([1,2,3], [1,2,4,5]) AS similarity
 // tag::create-sample-graph[]
 
 MERGE (fahrenheit451:Book {title:'Fahrenheit 451'})
-MERGE (dune:Book {title:'dune'})
+MERGE (dune:Book {title:'Dune'})
 MERGE (hungerGames:Book {title:'The Hunger Games'})
 MERGE (nineteen84:Book {title:'1984'})
+MERGE (gatsby:Book {title:'The Great Gatsby'})
 
 MERGE (scienceFiction:Genre {name: "Science Fiction"})
 MERGE (fantasy:Genre {name: "Fantasy"})
 MERGE (dystopia:Genre {name: "Dystopia"})
+MERGE (classics:Genre {name: "Classics"})
 
 MERGE (fahrenheit451)-[:HAS_GENRE]->(dystopia)
 MERGE (fahrenheit451)-[:HAS_GENRE]->(scienceFiction)
 MERGE (fahrenheit451)-[:HAS_GENRE]->(fantasy)
+MERGE (fahrenheit451)-[:HAS_GENRE]->(classics)
 
 MERGE (hungerGames)-[:HAS_GENRE]->(scienceFiction)
 MERGE (hungerGames)-[:HAS_GENRE]->(fantasy)
@@ -23,37 +26,43 @@ MERGE (hungerGames)-[:HAS_GENRE]->(romance)
 
 MERGE (nineteen84)-[:HAS_GENRE]->(scienceFiction)
 MERGE (nineteen84)-[:HAS_GENRE]->(dystopia)
+MERGE (nineteen84)-[:HAS_GENRE]->(classics)
 
 MERGE (dune)-[:HAS_GENRE]->(scienceFiction)
 MERGE (dune)-[:HAS_GENRE]->(fantasy)
+MERGE (dune)-[:HAS_GENRE]->(classics)
+
+MERGE (gatsby)-[:HAS_GENRE]->(classics)
 
 // end::create-sample-graph[]
 
 // tag::stream[]
-MATCH (p:Person)-[:LIKES]->(cuisine)
-WITH {item:id(p), categories: collect(id(cuisine))} as userData
+MATCH (book:Book)-[:HAS_GENRE]->(genre)
+WITH {item:id(genre), categories: collect(id(book))} as userData
 WITH collect(userData) as data
-CALL algo.similarity.jaccard.stream(data)
+CALL algo.similarity.overlap.stream(data)
 YIELD item1, item2, count1, count2, intersection, similarity
-RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to,
+       count1, count2, intersection, similarity
 ORDER BY similarity DESC
 // end::stream[]
 
 // tag::stream-similarity-cutoff[]
-MATCH (p:Person)-[:LIKES]->(cuisine)
-WITH {item:id(p), categories: collect(id(cuisine))} as userData
+MATCH (book:Book)-[:HAS_GENRE]->(genre)
+WITH {item:id(genre), categories: collect(id(book))} as userData
 WITH collect(userData) as data
-CALL algo.similarity.jaccard.stream(data, {similarityCutoff: 0.0})
+CALL algo.similarity.overlap.stream(data, {similarityCutoff: 0.75})
 YIELD item1, item2, count1, count2, intersection, similarity
-RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to,
+       count1, count2, intersection, similarity
 ORDER BY similarity DESC
 // end::stream-similarity-cutoff[]
 
 // tag::stream-topk[]
-MATCH (p:Person)-[:LIKES]->(cuisine)
-WITH {item:id(p), categories: collect(id(cuisine))} as userData
+MATCH (book:Book)-[:HAS_GENRE]->(genre)
+WITH {item:id(genre), categories: collect(id(book))} as userData
 WITH collect(userData) as data
-CALL algo.similarity.jaccard.stream(data, {topK: 1, similarityCutoff: 0.0})
+CALL algo.similarity.jaccard.stream(data, {topK: 1})
 YIELD item1, item2, count1, count2, intersection, similarity
 RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity
 ORDER BY from
diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc
index 187f80841..72493cea1 100644
--- a/doc/asciidoc/similarity-overlap.adoc
+++ b/doc/asciidoc/similarity-overlap.adoc
@@ -86,27 +86,23 @@ include::scripts/similarity-overlap.cypher[tag=stream]
 
 // tag::stream[]
 .Results
-[opts="header",cols="1,1,1,1"]
-|===
-| From     | To       | Intersection | Similarity
-| Arya     | Karin    | 2            | 0.66
-| Zhen     | Michael  | 2            | 0.66
-| Zhen     | Praveena | 1            | 0.33
-| Michael  | Karin    | 1            | 0.25
-| Praveena | Michael  | 1            | 0.25
-| Praveena | Arya     | 1            | 0.25
-| Michael  | Arya     | 1            | 0.2
-| Praveena | Karin    | 0            | 0
-| Zhen     | Arya     | 0            | 0
-| Zhen     | Karin    | 0            | 0
+[opts="header"]
+|===
+| `from`          | `to`                | `count1` | `count2` | `intersection`   | `similarity`
+| Fantasy         | Science Fiction | 3      | 4      | 3            | 1.0
+| Dystopia        | Science Fiction | 2      | 4      | 2            | 1.0
+| Dystopia        | Classics        | 2      | 4      | 2            | 1.0
+| Science Fiction | Classics        | 4      | 4      | 3            | 0.75
+| Fantasy         | Classics        | 3      | 4      | 2            | 0.66
+| Dystopia        | Fantasy         | 2      | 3      | 1            | 0.5
 |===
 // end::stream[]
 
-Arya and Karin, and Zhen and Michael have the most similar food preferences, with two overlapping cuisines for a similarity of 0.66.
-We also have 3 pairs of users who are not similar at all.
-We'd probably want to filter those out, which we can do by passing in the `similarityCutoff` parameter.
+Fantasy and Dystopia are both clear sub genres of Science Fiction - 100% of the books that list those as genres also list Science Fiction as a genre.
+Dystopia is also a sub genre of Classics.
+The others are less obvious - Dystopia probably isn't a sub genre of Fantasy, but the other two pairs could be sub genres.
 
-.The following will return a stream of node pairs that have a similarity of at least 0.1, along with their intersection and Overlap similarities:
+.The following will return a stream of node pairs that have a similarity of at least 0.75, along with their intersection and Overlap similarities:
 [source, cypher]
 ----
 include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff]
@@ -114,24 +110,21 @@ include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff]
 
 // tag::stream-similarity-cutoff[]
 .Results
-[opts="header",cols="1,1,1,1"]
+[opts="header"]
 |===
-| `from`   | `to`     | `intersection` | `similarity`
-| Arya     | Karin    | 2              | 0.66
-| Zhen     | Michael  | 2              | 0.66
-| Zhen     | Praveena | 1              | 0.33
-| Michael  | Karin    | 1              | 0.25
-| Praveena | Michael  | 1              | 0.25
-| Praveena | Arya     | 1              | 0.25
-| Michael  | Arya     | 1              | 0.2
+| `from`          | `to`                | `count1` | `count2` | `intersection`   | `similarity`
+| Fantasy         | Science Fiction | 3      | 4      | 3            | 1.0
+| Dystopia        | Science Fiction | 2      | 4      | 2            | 1.0
+| Dystopia        | Classics        | 2      | 4      | 2            | 1.0
+| Science Fiction | Classics        | 4      | 4      | 3            | 0.75
 |===
 // end::stream-similarity-cutoff[]
 
-We can see that those users with no similarity have been filtered out.
-If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` users for a given user.
+We can see that those genres with lower similarity have been filtered out.
+If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` super genres for a given genre.
 We can do that by passing in the `topK` parameter.
 
-.The following will return a stream of users along with the most similar user to them (i.e. `k=1`):
+.The following will return a stream of genres along with the most similar super category to them (i.e. `k=1`):
 [source, cypher]
 ----
 include::scripts/similarity-overlap.cypher[tag=stream-topk]

From 88d1d3c04f8d363858000aff952ae5f1817cb394 Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Mon, 1 Oct 2018 13:53:59 +0100
Subject: [PATCH 5/6] more examples

---
 .../scripts/similarity-overlap.cypher         | 18 +++++-----
 doc/asciidoc/similarity-overlap.adoc          | 34 +++++++++----------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher
index ef7ad4842..52cf0ed3d 100644
--- a/doc/asciidoc/scripts/similarity-overlap.cypher
+++ b/doc/asciidoc/scripts/similarity-overlap.cypher
@@ -62,24 +62,24 @@ ORDER BY similarity DESC
 MATCH (book:Book)-[:HAS_GENRE]->(genre)
 WITH {item:id(genre), categories: collect(id(book))} as userData
 WITH collect(userData) as data
-CALL algo.similarity.jaccard.stream(data, {topK: 1})
+CALL algo.similarity.overlap.stream(data, {topK: 2})
 YIELD item1, item2, count1, count2, intersection, similarity
-RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity
+RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to,
+       count1, count2, intersection, similarity
 ORDER BY from
 // end::stream-topk[]
 
 // tag::write-back[]
-MATCH (p:Person)-[:LIKES]->(cuisine)
-WITH {item:id(p), categories: collect(id(cuisine))} as userData
+MATCH (book:Book)-[:HAS_GENRE]->(genre)
+WITH {item:id(genre), categories: collect(id(book))} as userData
 WITH collect(userData) as data
-CALL algo.similarity.jaccard(data, {topK: 1, similarityCutoff: 0.1, write:true})
+CALL algo.similarity.overlap(data, {topK: 2, similarityCutoff: 0.5, write:true})
 YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
 RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95
 // end::write-back[]
 
 // tag::query[]
-MATCH (p:Person {name: "Praveena"})-[:SIMILAR]->(other),
-      (other)-[:LIKES]->(cuisine)
-WHERE not((p)-[:LIKES]->(cuisine))
-RETURN cuisine.name AS cuisine
+MATCH path = (fantasy:Genre {name: "Fantasy"})-[:NARROWER_THAN*]->(genre)
+RETURN [node in nodes(path) | node.name] AS hierarchy
+ORDER BY length(path)
 // end::query[]
diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc
index 72493cea1..ac6eb5cf6 100644
--- a/doc/asciidoc/similarity-overlap.adoc
+++ b/doc/asciidoc/similarity-overlap.adoc
@@ -124,7 +124,7 @@ We can see that those genres with lower similarity have been filtered out.
 If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` super genres for a given genre.
 We can do that by passing in the `topK` parameter.
 
-.The following will return a stream of genres along with the most similar super category to them (i.e. `k=1`):
+.The following will return a stream of genres along with the two most similar super genres to them (i.e. `k=2`):
 [source, cypher]
 ----
 include::scripts/similarity-overlap.cypher[tag=stream-topk]
@@ -132,19 +132,17 @@ include::scripts/similarity-overlap.cypher[tag=stream-topk]
 
 // tag::stream-topk[]
 .Results
-[opts="header",cols="1,1,1"]
+[opts="header"]
 |===
-| `from`   | `to`     | `similarity`
-| Arya     | Karin    | 0.66
-| Karin    | Arya     | 0.66
-| Michael  | Zhen     | 0.66
-| Praveena | Zhen     | 0.33
-| Zhen     | Michael  | 0.66
+| `from`          | `to`                | `count1` | `count2` | `intersection`   | `similarity`
+| Dystopia        | Classics        | 2      | 4      | 2            | 1.0
+| Dystopia        | Science Fiction | 2      | 4      | 2            | 1.0
+| Fantasy         | Science Fiction | 3      | 4      | 3            | 1.0
+| Fantasy         | Classics        | 3      | 4      | 2            | 0.6666666666666666
+| Science Fiction | Classics        | 4      | 4      | 3            | 0.75
 |===
 // end::stream-topk[]
 
-These results will not be symmetrical.
-For example, the person most similar to Praveena is Zhen, but the person most similar to Zhen is actually Michael.
 
 .Parameters
 [opts="header",cols="1,1,1,1,4"]
@@ -178,16 +176,16 @@ include::scripts/similarity-overlap.cypher[tag=write-back]
 
 // tag::write-back[]
 .Results
-[opts="header",cols="1,1,1,1,1,1,1,1,1"]
+[opts="header"]
 |===
 | `nodes` | `similarityPairs` | `write` | `writeRelationshipType` | `writeProperty` | `min`  | `max`  | `mean` | `p95`
-| 5       | 5                 | true    | SIMILAR                 | score           | 0.33   | 0.66   | 0.59   | 0.66
+| 4     | 5               | TRUE  | NARROWER_THAN       | score       | 0.6666641235351562 | 1.0000038146972656 | 0.8833351135253906 | 1.0000038146972656
 |===
 // end::write-back[]
 
-We then could write a query to find out what types of cuisine that other people similar to us might like.
+We could then write a query to find the genre hierarchy for a specific genre.
 
-.The following will find the most similar user to Praveena, and return their favorite cuisines that Praveena doesn't (yet!) like:
+.The following will find the genre hierarchy for the Fantasy genre:
 [source, cypher]
 ----
 include::scripts/similarity-overlap.cypher[tag=query]
@@ -197,8 +195,10 @@ include::scripts/similarity-overlap.cypher[tag=query]
 .Results
 [opts="header",cols="1"]
 |===
-| `cuisine`
-| French
+| `hierarchy`
+| ["Fantasy", "Science Fiction"]
+| ["Fantasy", "Classics"]
+| ["Fantasy", "Science Fiction", "Classics"]
 |===
 // end::query[]
 
@@ -213,7 +213,7 @@ include::scripts/similarity-overlap.cypher[tag=query]
 | `degreeCutoff`           | int     | 0              | yes      | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation.
 | `concurrency`            | int     | available CPUs | yes      | The number of concurrent threads.
 | `write`                  | boolean | false          | yes      | Indicates whether results should be stored.
-| `writeRelationshipType`  | string  | SIMILAR        | yes      | The relationship type to use when storing results.
+| `writeRelationshipType`  | string  | NARROWER_THAN        | yes      | The relationship type to use when storing results.
 | `writeProperty`          | string  | score          | yes      | The property to use when storing results.
 |===
 

From 15a5c91933ebace154590ef6a9758d2c9131f4e2 Mon Sep 17 00:00:00 2001
From: Mark Needham <mark.needham@neotechnology.com>
Date: Tue, 2 Oct 2018 11:53:31 +0100
Subject: [PATCH 6/6] link overlap similarity

---
 doc/asciidoc/algorithms-similarity.adoc | 2 ++
 doc/docbook/content-map.xml             | 4 +++-
 readme.adoc                             | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/asciidoc/algorithms-similarity.adoc b/doc/asciidoc/algorithms-similarity.adoc
index 842f7a1f5..dd6b1a996 100644
--- a/doc/asciidoc/algorithms-similarity.adoc
+++ b/doc/asciidoc/algorithms-similarity.adoc
@@ -13,7 +13,9 @@ These algorithms help calculate the similarity of nodes:
 * <<algorithms-similarity-jaccard, Jaccard Similarity>> (`algo.similarity.jaccard`)
 * <<algorithms-similarity-cosine, Cosine Similarity>> (`algo.similarity.cosine`)
 * <<algorithms-similarity-euclidean, Euclidean Distance>> (`algo.similarity.euclidean`)
+* <<algorithms-similarity-overlap, Overlap Similarity>> (`algo.similarity.overlap`)
 
 include::similarity-jaccard.adoc[leveloffset=2]
 include::similarity-cosine.adoc[leveloffset=2]
 include::similarity-euclidean.adoc[leveloffset=2]
+include::similarity-overlap.adoc[leveloffset=2]
diff --git a/doc/docbook/content-map.xml b/doc/docbook/content-map.xml
index 4e97ba77e..20205484b 100644
--- a/doc/docbook/content-map.xml
+++ b/doc/docbook/content-map.xml
@@ -56,13 +56,15 @@
       </d:tocentry>
       <d:tocentry linkend="algorithms-similarity-euclidean"><?dbhtml filename="algorithms/similarity-euclidean/index.html"?>
       </d:tocentry>
+      <d:tocentry linkend="algorithms-similarity-overlap"><?dbhtml filename="algorithms/similarity-overlap/index.html"?>
+      </d:tocentry>
     </d:tocentry>
 
     <d:tocentry linkend="algorithms-preprocessing"><?dbhtml filename="algorithms/preprocessing/index.html"?>
       <d:tocentry linkend="algorithms-one-hot-encoding"><?dbhtml filename="algorithms/one-hot-encoding/index.html"?>
       </d:tocentry>
     </d:tocentry>
-    
+
   </d:tocentry>
 </d:toc>
 <!-- vim: set ts=2 sw=2: -->
diff --git a/readme.adoc b/readme.adoc
index 7046336f9..fbf803fe6 100644
--- a/readme.adoc
+++ b/readme.adoc
@@ -69,6 +69,7 @@ These algorithms help calculate the similarity of nodes:
 * <<algorithms-similarity-jaccard, Jaccard Similarity>> (`algo.similarity.jaccard`)
 * <<algorithms-similarity-cosine, Cosine Similarity>> (`algo.similarity.cosine`)
 * <<algorithms-similarity-euclidean, Euclidean Distance>> (`algo.similarity.euclidean`)
+* <<algorithms-similarity-overlap, Overlap Similarity>> (`algo.similarity.overlap`)
 
 === Preprocessing