From 9db89a4a657193777e55b312407ee5cbbb88907b Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Sun, 30 Sep 2018 17:39:23 +0100 Subject: [PATCH 1/6] wip: Overlap similarity - need to figure out how it should work for the topK variant --- .../similarity/CategoricalInput.java | 17 + .../graphalgo/similarity/CosineProc.java | 2 +- .../graphalgo/similarity/EuclideanProc.java | 2 +- .../graphalgo/similarity/JaccardProc.java | 2 +- .../graphalgo/similarity/OverlapProc.java | 70 ++++ .../graphalgo/similarity/SimilarityProc.java | 4 +- .../algo/similarity/OverlapTest.java | 337 ++++++++++++++++++ 7 files changed, 429 insertions(+), 5 deletions(-) create mode 100644 algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java create mode 100644 tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java index 761f019e9..bc5ec742d 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java @@ -26,4 +26,21 @@ SimilarityResult jaccard(double similarityCutoff, CategoricalInput e2) { if (jaccard < similarityCutoff) return null; return new SimilarityResult(id, e2.id, count1, count2, intersection, jaccard); } + + SimilarityResult overlap(double similarityCutoff, CategoricalInput e2) { + long intersection = Intersections.intersection3(targets, e2.targets); + if (similarityCutoff >= 0d && intersection == 0) return null; + int count1 = targets.length; + int count2 = e2.targets.length; + long denominator = Math.min(count1, count2); + double overlap = denominator == 0 ? 
0 : (double)intersection / denominator; + if (overlap < similarityCutoff) return null; + + if(count1 <= count2) { + return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap); + } else { + return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap); + } + + } } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java index 67e11ef3e..1ff223de2 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CosineProc.java @@ -81,7 +81,7 @@ public Stream cosine( boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0; - return writeAndAggregateResults(configuration, stream, inputs.length, write); + return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR"); } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java index 0a0c0a765..d845af1cf 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/EuclideanProc.java @@ -80,7 +80,7 @@ public Stream euclidean( .map(SimilarityResult::squareRooted); boolean write = configuration.isWriteFlag(false); // && similarityCutoff != 0.0; - return writeAndAggregateResults(configuration, stream, inputs.length, write); + return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR"); } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java index 594d8fb6b..435bff9ba 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/JaccardProc.java @@ -61,7 +61,7 @@ public Stream jaccard( Stream stream = 
topN(similarityStream(inputs, computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration)); boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0; - return writeAndAggregateResults(configuration, stream, inputs.length, write); + return writeAndAggregateResults(configuration, stream, inputs.length, write, "SIMILAR"); } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java new file mode 100644 index 000000000..ee6075970 --- /dev/null +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/OverlapProc.java @@ -0,0 +1,70 @@ +/** + * Copyright (c) 2017 "Neo4j, Inc." + * + * This file is part of Neo4j Graph Algorithms . + * + * Neo4j Graph Algorithms is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +package org.neo4j.graphalgo.similarity; + +import org.neo4j.graphalgo.core.ProcedureConfiguration; +import org.neo4j.procedure.Description; +import org.neo4j.procedure.Mode; +import org.neo4j.procedure.Name; +import org.neo4j.procedure.Procedure; + +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +public class OverlapProc extends SimilarityProc { + + @Procedure(name = "algo.similarity.overlap.stream", mode = Mode.READ) + @Description("CALL algo.similarity.overlap.stream([{source:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + "YIELD item1, item2, count1, count2, intersection, similarity - computes jaccard similarities") + public Stream similarityStream( + @Name(value = "data", defaultValue = "null") List> data, + @Name(value = "config", defaultValue = "{}") Map config) { + + SimilarityComputer computer = (s, t, cutoff) -> s.overlap(cutoff, t); + + ProcedureConfiguration configuration = ProcedureConfiguration.create(config); + + CategoricalInput[] inputs = prepareCategories(data, getDegreeCutoff(configuration)); + + return topN(similarityStream(inputs, computer, configuration, getSimilarityCutoff(configuration), getTopK(configuration)), getTopN(configuration)); + } + + @Procedure(name = "algo.similarity.overlap", mode = Mode.WRITE) + @Description("CALL algo.similarity.overlap([{source:id, targets:[ids]}], {similarityCutoff:-1,degreeCutoff:0}) " + + "YIELD p50, p75, p90, p99, p999, p100 - computes jaccard similarities") + public Stream overlap( + @Name(value = "data", defaultValue = "null") List> data, + @Name(value = "config", defaultValue = "{}") Map config) { + + SimilarityComputer computer = (s,t,cutoff) -> s.overlap(cutoff, t); + + ProcedureConfiguration configuration = ProcedureConfiguration.create(config); + + CategoricalInput[] inputs = prepareCategories(data, getDegreeCutoff(configuration)); + + double similarityCutoff = getSimilarityCutoff(configuration); + Stream stream = topN(similarityStream(inputs, 
computer, configuration, similarityCutoff, getTopK(configuration)), getTopN(configuration)); + + boolean write = configuration.isWriteFlag(false) && similarityCutoff > 0.0; + return writeAndAggregateResults(configuration, stream, inputs.length, write, "NARROWER_THAN"); + } + + +} diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java index e73c7de8e..9c1126422 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java @@ -66,8 +66,8 @@ Long getDegreeCutoff(ProcedureConfiguration configuration) { return configuration.get("degreeCutoff", 0L); } - Stream writeAndAggregateResults(ProcedureConfiguration configuration, Stream stream, int length, boolean write) { - String writeRelationshipType = configuration.get("writeRelationshipType", "SIMILAR"); + Stream writeAndAggregateResults(ProcedureConfiguration configuration, Stream stream, int length, boolean write, String defaultWriteProperty) { + String writeRelationshipType = configuration.get("writeRelationshipType", defaultWriteProperty); String writeProperty = configuration.getWriteProperty("score"); AtomicLong similarityPairs = new AtomicLong(); diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java new file mode 100644 index 000000000..d2d2b9ebe --- /dev/null +++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java @@ -0,0 +1,337 @@ +/** + * Copyright (c) 2017 "Neo4j, Inc." + * + * This file is part of Neo4j Graph Algorithms . + * + * Neo4j Graph Algorithms is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.neo4j.graphalgo.algo.similarity; + +import org.junit.*; +import org.neo4j.graphalgo.TestDatabaseCreator; +import org.neo4j.graphalgo.similarity.OverlapProc; +import org.neo4j.graphdb.Result; +import org.neo4j.graphdb.Transaction; +import org.neo4j.internal.kernel.api.exceptions.KernelException; +import org.neo4j.kernel.impl.proc.Procedures; +import org.neo4j.kernel.internal.GraphDatabaseAPI; + +import java.util.Map; + +import static java.util.Collections.singletonMap; +import static org.junit.Assert.*; +import static org.neo4j.helpers.collection.MapUtil.map; + +public class OverlapTest { + + private static GraphDatabaseAPI db; + private Transaction tx; + public static final String STATEMENT_STREAM = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" + + "WITH {item:id(p), categories: collect(distinct id(i))} as userData\n" + + "WITH collect(userData) as data\n" + + "call algo.similarity.overlap.stream(data,$config) " + + "yield item1, item2, count1, count2, intersection, similarity " + + "RETURN item1, item2, count1, count2, intersection, similarity " + + "ORDER BY item1,item2"; + + public static final String STATEMENT = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" + + "WITH {item:id(p), categories: collect(distinct id(i))} as userData\n" + + "WITH collect(userData) as data\n" + + "CALL algo.similarity.overlap(data, $config) " + + "yield p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs " + + "RETURN p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs"; + + public static final String STORE_EMBEDDING_STATEMENT = "MATCH (p:Person)-[:LIKES]->(i:Item) \n" + + "WITH p, 
collect(distinct id(i)) as userData\n" + + "SET p.embedding = userData"; + + public static final String EMBEDDING_STATEMENT = "MATCH (p:Person) \n" + + "WITH {item:id(p), categories: p.embedding} as userData\n" + + "WITH collect(userData) as data\n" + + "CALL algo.similarity.overlap(data, $config) " + + "yield p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs " + + "RETURN p25, p50, p75, p90, p95, p99, p999, p100, nodes, similarityPairs"; + + @BeforeClass + public static void beforeClass() throws KernelException { + db = TestDatabaseCreator.createTestDatabase(); + db.getDependencyResolver().resolveDependency(Procedures.class).registerProcedure(OverlapProc.class); + db.execute(buildDatabaseQuery()).close(); + } + + @AfterClass + public static void AfterClass() { + db.shutdown(); + } + + @Before + public void setUp() throws Exception { + tx = db.beginTx(); + } + + @After + public void tearDown() throws Exception { + tx.close(); + } + + private static void buildRandomDB(int size) { + db.execute("MATCH (n) DETACH DELETE n").close(); + db.execute("UNWIND range(1,$size/10) as _ CREATE (:Person) CREATE (:Item) ",singletonMap("size",size)).close(); + String statement = + "MATCH (p:Person) WITH collect(p) as people " + + "MATCH (i:Item) WITH people, collect(i) as items " + + "UNWIND range(1,$size) as _ " + + "WITH people[toInteger(rand()*size(people))] as p, items[toInteger(rand()*size(items))] as i " + + "MERGE (p)-[:LIKES]->(i) RETURN count(*) "; + db.execute(statement,singletonMap("size",size)).close(); + } + + private static String buildDatabaseQuery() { + return "CREATE (a:Person {name:'Alice'})\n" + + "CREATE (b:Person {name:'Bob'})\n" + + "CREATE (c:Person {name:'Charlie'})\n" + + "CREATE (d:Person {name:'Dana'})\n" + + "CREATE (i1:Item {name:'p1'})\n" + + "CREATE (i2:Item {name:'p2'})\n" + + "CREATE (i3:Item {name:'p3'})\n" + + + "CREATE" + + " (a)-[:LIKES]->(i1),\n" + + " (a)-[:LIKES]->(i2),\n" + + " (a)-[:LIKES]->(i3),\n" + + " 
(b)-[:LIKES]->(i1),\n" + + " (b)-[:LIKES]->(i2),\n" + + " (c)-[:LIKES]->(i3)\n"; + // a: 3 + // b: 2 + // c: 1 + // a / b = 2 : 2/3 + // a / c = 1 : 1/3 + // b / c = 0 : 0/3 = 0 + } + + + @Test + public void overlapSingleMultiThreadComparision() { + int size = 333; + buildRandomDB(size); + Result result1 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 1))); + Result result2 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 2))); + Result result4 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 4))); + Result result8 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"concurrency", 8))); + int count=0; + while (result1.hasNext()) { + Map row1 = result1.next(); + assertEquals(row1.toString(), row1,result2.next()); + assertEquals(row1.toString(), row1,result4.next()); + assertEquals(row1.toString(), row1,result8.next()); + count++; + } + int people = size/10; + assertEquals((people * people - people)/2,count); + } + + @Test + public void overlapSingleMultiThreadComparisionTopK() { + int size = 333; + buildRandomDB(size); + + Result result1 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 1))); + Result result2 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 2))); + Result result4 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 4))); + Result result8 = db.execute(STATEMENT_STREAM, map("config", map("similarityCutoff",-0.1,"topK",1,"concurrency", 8))); + int count=0; + while (result1.hasNext()) { + Map row1 = result1.next(); + assertEquals(row1.toString(), row1,result2.next()); + assertEquals(row1.toString(), row1,result4.next()); + assertEquals(row1.toString(), row1,result8.next()); + count++; + } + int people = size/10; + assertEquals(people,count); + } + + @Test + public void 
topNoverlapStreamTest() { + Result results = db.execute(STATEMENT_STREAM, map("config",map("top",2))); + assert01(results.next()); + assert02(results.next()); + assertFalse(results.hasNext()); + } + + @Test + public void overlapStreamTest() { + Result results = db.execute(STATEMENT_STREAM, map("config",map("concurrency",1))); + + assertTrue(results.hasNext()); + assert01(results.next()); + assert02(results.next()); + assert12(results.next()); + assertFalse(results.hasNext()); + } + + @Test + public void topKoverlapStreamTest() { + Map params = map("config", map( "concurrency", 1,"topK", 1)); + System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); + + Result results = db.execute(STATEMENT_STREAM, params); + assertTrue(results.hasNext()); + assert01(results.next()); + assert01(flip(results.next())); + assert02(flip(results.next())); + assertFalse(results.hasNext()); + } + + private Map flip(Map row) { + return map("similarity", row.get("similarity"),"intersection", row.get("intersection"), + "item1",row.get("item2"),"count1",row.get("count2"), + "item2",row.get("item1"),"count2",row.get("count1")); + } + + private void assertSameSource(Result results, int count, long source) { + Map row; + long target = 0; + for (int i = 0; i params = map("config", map("topK", 4, "concurrency", 4, "similarityCutoff", -0.1)); + System.out.println(db.execute(STATEMENT_STREAM,params).resultAsString()); + + Result results = db.execute(STATEMENT_STREAM,params); + assertSameSource(results, 2, 0L); + assertSameSource(results, 2, 1L); + assertSameSource(results, 2, 2L); + assertFalse(results.hasNext()); + } + + @Test + public void topK3overlapStreamTest() { + Map params = map("config", map("concurrency", 3, "topK", 3)); + + System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); + + Result results = db.execute(STATEMENT_STREAM, params); + assertSameSource(results, 2, 0L); + assertSameSource(results, 2, 1L); + assertSameSource(results, 2, 2L); + 
assertFalse(results.hasNext()); + } + + @Test + public void simpleoverlapTest() { + Map params = map("config", map("similarityCutoff", 0.0)); + + Map row = db.execute(STATEMENT,params).next(); + assertEquals((double) row.get("p25"), 1.0, 0.01); + assertEquals((double) row.get("p50"), 1.0, 0.01); + assertEquals((double) row.get("p75"), 1.0, 0.01); + assertEquals((double) row.get("p95"), 1.0, 0.01); + assertEquals((double) row.get("p99"), 1.0, 0.01); + assertEquals((double) row.get("p100"), 1.0, 0.01); + } + + @Test + public void simpleoverlapFromEmbeddingTest() { + db.execute(STORE_EMBEDDING_STATEMENT); + + Map params = map("config", map("similarityCutoff", 0.0)); + + Map row = db.execute(EMBEDDING_STATEMENT,params).next(); + System.out.println("row = " + row); + assertEquals((double) row.get("p25"), 1.0, 0.01); + assertEquals((double) row.get("p50"), 1.0, 0.01); + assertEquals((double) row.get("p75"), 1.0, 0.01); + assertEquals((double) row.get("p95"), 1.0, 0.01); + assertEquals((double) row.get("p99"), 1.0, 0.01); + assertEquals((double) row.get("p100"), 1.0, 0.01); + } + + /* + Alice [p1,p2,p3] + Bob [p1,p2] + Charlie [p3] + Dana [] + */ + + @Test + public void simpleoverlapWriteTest() { + Map params = map("config", map( "write",true, "similarityCutoff", 0.1)); + + db.execute(STATEMENT,params).close(); + + String checkSimilaritiesQuery = "MATCH (a)-[similar:NARROWER_THAN]->(b)" + + "RETURN a.name AS node1, b.name as node2, similar.score AS score " + + "ORDER BY id(a), id(b)"; + + System.out.println(db.execute(checkSimilaritiesQuery).resultAsString()); + Result result = db.execute(checkSimilaritiesQuery); + + assertTrue(result.hasNext()); + Map row = result.next(); + assertEquals(row.get("node1"), "Bob"); + assertEquals(row.get("node2"), "Alice"); + assertEquals((double) row.get("score"), 1.0, 0.01); + + assertTrue(result.hasNext()); + row = result.next(); + assertEquals(row.get("node1"), "Charlie"); + assertEquals(row.get("node2"), "Alice"); + 
assertEquals((double) row.get("score"), 1.0, 0.01); + + assertFalse(result.hasNext()); + } + + private void assert12(Map row) { + assertEquals(2L, row.get("item1")); + assertEquals(1L, row.get("item2")); + assertEquals(1L, row.get("count1")); + assertEquals(2L, row.get("count2")); + // assertEquals(0L, row.get("intersection")); + assertEquals(0d, row.get("similarity")); + } + + // a / b = 2 : 2/3 + // a / c = 1 : 1/3 + // b / c = 0 : 0/3 = 0 + + private void assert02(Map row) { + assertEquals(2L, row.get("item1")); + assertEquals(0L, row.get("item2")); + assertEquals(1L, row.get("count1")); + assertEquals(3L, row.get("count2")); + // assertEquals(1L, row.get("intersection")); + assertEquals(1d/1d, row.get("similarity")); + } + + private void assert01(Map row) { + assertEquals(1L, row.get("item1")); + assertEquals(0L, row.get("item2")); + assertEquals(2L, row.get("count1")); + assertEquals(3L, row.get("count2")); + // assertEquals(2L, row.get("intersection")); + assertEquals(2d/2d, row.get("similarity")); + } +} From 075f0bd2b82c247b737a8aa2b72fa836607a817b Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Mon, 1 Oct 2018 11:36:15 +0100 Subject: [PATCH 2/6] handle returning the smaller set first for topK --- .../similarity/CategoricalInput.java | 4 +- .../graphalgo/similarity/SimilarityProc.java | 60 +++++++++++-------- .../similarity/SimilarityResult.java | 11 +++- .../algo/similarity/OverlapTest.java | 16 ++--- 4 files changed, 54 insertions(+), 37 deletions(-) diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java index bc5ec742d..f0bddf8c0 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/CategoricalInput.java @@ -37,9 +37,9 @@ SimilarityResult overlap(double similarityCutoff, CategoricalInput e2) { if (overlap < similarityCutoff) return null; if(count1 <= count2) { - 
return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap); + return new SimilarityResult(id, e2.id, count1, count2, intersection, overlap, false, false); } else { - return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap); + return new SimilarityResult(e2.id, id, count2, count1, intersection, overlap, false, true); } } diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java index 9c1126422..237c18643 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityProc.java @@ -51,7 +51,7 @@ static Stream topN(Stream stream, int topN) if (topN > 10000) { return stream.sorted(comparator).limit(topN); } - return topK(stream,topN, comparator); + return topK(stream, topN, comparator); } private static void put(BlockingQueue queue, T items) { @@ -77,7 +77,7 @@ Stream writeAndAggregateResults(ProcedureConfiguration similarityPairs.getAndIncrement(); }; - if(write) { + if (write) { SimilarityExporter similarityExporter = new SimilarityExporter(api, writeRelationshipType, writeProperty); similarityExporter.export(stream.peek(recorder)); } else { @@ -114,17 +114,15 @@ Stream similarityStream(T[] inputs, SimilarityComputer private Stream similarityStream(T[] inputs, int length, double similiarityCutoff, SimilarityComputer computer) { return IntStream.range(0, length) .boxed().flatMap(sourceId -> IntStream.range(sourceId + 1, length) - .mapToObj(targetId -> computer.similarity(inputs[sourceId],inputs[targetId],similiarityCutoff)).filter(Objects::nonNull)); + .mapToObj(targetId -> computer.similarity(inputs[sourceId], inputs[targetId], similiarityCutoff)).filter(Objects::nonNull)); } private Stream similarityStreamTopK(T[] inputs, int length, double cutoff, int topK, SimilarityComputer computer) { TopKConsumer[] topKHolder = initializeTopKConsumers(length, 
topK); - for (int sourceId = 0;sourceId < length;sourceId++) { - computeSimilarityForSourceIndex(sourceId, inputs, length, cutoff, (sourceIndex, targetIndex, similarityResult) -> { - topKHolder[sourceIndex].accept(similarityResult); - topKHolder[targetIndex].accept(similarityResult.reverse()); - }, computer); + SimilarityConsumer consumer = assignSimilarityPairs(topKHolder); + for (int sourceId = 0; sourceId < length; sourceId++) { + computeSimilarityForSourceIndex(sourceId, inputs, length, cutoff, consumer, computer); } return Arrays.stream(topKHolder).flatMap(TopKConsumer::stream); } @@ -176,13 +174,13 @@ private Stream similarityParallelStreamTopK(T[] inputs, in ParallelUtil.runWithConcurrency(concurrency, tasks, terminationFlag, Pools.DEFAULT); TopKConsumer[] topKConsumers = initializeTopKConsumers(length, topK); - for (Runnable task : tasks) ((TopKTask)task).mergeInto(topKConsumers); + for (Runnable task : tasks) ((TopKTask) task).mergeInto(topKConsumers); return Arrays.stream(topKConsumers).flatMap(TopKConsumer::stream); } private void computeSimilarityForSourceIndex(int sourceId, T[] inputs, int length, double cutoff, SimilarityConsumer consumer, SimilarityComputer computer) { - for (int targetId=sourceId+1;targetId> data, long degree for (Map row : data) { List targetIds = extractValues(row.get("categories")); int size = targetIds.size(); - if ( size > degreeCutoff) { + if (size > degreeCutoff) { long[] targets = new long[size]; - int i=0; + int i = 0; for (Number id : targetIds) { - targets[i++]=id.longValue(); + targets[i++] = id.longValue(); } Arrays.sort(targets); ids[idx++] = new CategoricalInput((Long) row.get("item"), targets); @@ -218,11 +216,11 @@ WeightedInput[] prepareWeights(List> data, long degreeCutoff List weightList = extractValues(row.get("weights")); int size = weightList.size(); - if ( size > degreeCutoff) { + if (size > degreeCutoff) { double[] weights = new double[size]; - int i=0; + int i = 0; for (Number value : weightList) { - 
weights[i++]=value.doubleValue(); + weights[i++] = value.doubleValue(); } inputs[idx++] = new WeightedInput((Long) row.get("item"), weights); } @@ -233,7 +231,7 @@ WeightedInput[] prepareWeights(List> data, long degreeCutoff } private List extractValues(Object rawValues) { - if(rawValues == null) { + if (rawValues == null) { return Collections.emptyList(); } @@ -259,13 +257,24 @@ protected int getTopK(ProcedureConfiguration configuration) { } protected int getTopN(ProcedureConfiguration configuration) { - return configuration.getInt("top",0); + return configuration.getInt("top", 0); } interface SimilarityComputer { SimilarityResult similarity(T source, T target, double cutoff); } + public static SimilarityConsumer assignSimilarityPairs(TopKConsumer[] topKConsumers) { + return (s, t, result) -> { + topKConsumers[result.reversed ? t : s].accept(result); + + if (result.bidirectional) { + SimilarityResult reverse = result.reverse(); + topKConsumers[reverse.reversed ? t : s].accept(reverse); + } + }; + } + private class TopKTask implements Runnable { private final int batchSize; private final int taskOffset; @@ -273,10 +282,10 @@ private class TopKTask implements Runnable { private final int length; private final T[] ids; private final double similiarityCutoff; - private final SimilarityComputer computer; + private final SimilarityComputer computer; private final TopKConsumer[] topKConsumers; - TopKTask(int batchSize, int taskOffset, int multiplier, int length, T[] ids, double similiarityCutoff, int topK, SimilarityComputer computer) { + TopKTask(int batchSize, int taskOffset, int multiplier, int length, T[] ids, double similiarityCutoff, int topK, SimilarityComputer computer) { this.batchSize = batchSize; this.taskOffset = taskOffset; this.multiplier = multiplier; @@ -289,16 +298,17 @@ private class TopKTask implements Runnable { @Override public void run() { + SimilarityConsumer consumer = assignSimilarityPairs(topKConsumers); for (int offset = 0; offset < batchSize; 
offset++) { int sourceId = taskOffset * multiplier + offset; if (sourceId < length) { - computeSimilarityForSourceIndex(sourceId, ids, length, similiarityCutoff, (s, t, result) -> { - topKConsumers[s].accept(result); - topKConsumers[t].accept(result.reverse()); - }, computer); + + computeSimilarityForSourceIndex(sourceId, ids, length, similiarityCutoff, consumer, computer); } } } + + void mergeInto(TopKConsumer[] target) { for (int i = 0; i < target.length; i++) { target[i].accept(topKConsumers[i]); diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java index 02d9441df..ead39a726 100644 --- a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java +++ b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityResult.java @@ -30,16 +30,23 @@ public class SimilarityResult implements Comparable { public final long count2; public final long intersection; public double similarity; + public final boolean bidirectional; + public final boolean reversed; public static SimilarityResult TOMB = new SimilarityResult(-1, -1, -1, -1, -1, -1); - public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity) { + public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity, boolean bidirectional, boolean reversed) { this.item1 = item1; this.item2 = item2; this.count1 = count1; this.count2 = count2; this.intersection = intersection; this.similarity = similarity; + this.bidirectional = bidirectional; + this.reversed = reversed; + } + public SimilarityResult(long item1, long item2, long count1, long count2, long intersection, double similarity) { + this(item1,item2, count1,count2,intersection,similarity, true, false); } @Override @@ -70,7 +77,7 @@ public int compareTo(SimilarityResult o) { } public SimilarityResult reverse() { - return new SimilarityResult(item2, 
item1,count2,count1,intersection,similarity); + return new SimilarityResult(item2, item1,count2,count1,intersection,similarity,bidirectional,!reversed); } public SimilarityResult squareRooted() { diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java index d2d2b9ebe..ead065d98 100644 --- a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java +++ b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/OverlapTest.java @@ -159,8 +159,9 @@ public void overlapSingleMultiThreadComparisionTopK() { assertEquals(row1.toString(), row1,result8.next()); count++; } - int people = size/10; - assertEquals(people,count); + assertFalse(result2.hasNext()); + assertFalse(result4.hasNext()); + assertFalse(result8.hasNext()); } @Test @@ -190,8 +191,7 @@ public void topKoverlapStreamTest() { Result results = db.execute(STATEMENT_STREAM, params); assertTrue(results.hasNext()); assert01(results.next()); - assert01(flip(results.next())); - assert02(flip(results.next())); + assert02(results.next()); assertFalse(results.hasNext()); } @@ -221,8 +221,8 @@ public void topK4overlapStreamTest() { System.out.println(db.execute(STATEMENT_STREAM,params).resultAsString()); Result results = db.execute(STATEMENT_STREAM,params); - assertSameSource(results, 2, 0L); - assertSameSource(results, 2, 1L); + assertSameSource(results, 0, 0L); + assertSameSource(results, 1, 1L); assertSameSource(results, 2, 2L); assertFalse(results.hasNext()); } @@ -234,8 +234,8 @@ public void topK3overlapStreamTest() { System.out.println(db.execute(STATEMENT_STREAM, params).resultAsString()); Result results = db.execute(STATEMENT_STREAM, params); - assertSameSource(results, 2, 0L); - assertSameSource(results, 2, 1L); + assertSameSource(results, 0, 0L); + assertSameSource(results, 1, 1L); assertSameSource(results, 2, 2L); assertFalse(results.hasNext()); } From 899c44d8acf5f43d2caa5c0662635358d051311b 
Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Mon, 1 Oct 2018 12:53:20 +0100 Subject: [PATCH 3/6] overlap docs --- .../scripts/similarity-overlap.cypher | 76 ++++++ doc/asciidoc/similarity-overlap.adoc | 248 ++++++++++++++++++ 2 files changed, 324 insertions(+) create mode 100644 doc/asciidoc/scripts/similarity-overlap.cypher create mode 100644 doc/asciidoc/similarity-overlap.adoc diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher new file mode 100644 index 000000000..1d1bd5546 --- /dev/null +++ b/doc/asciidoc/scripts/similarity-overlap.cypher @@ -0,0 +1,76 @@ +// tag::function[] +RETURN algo.similarity.overlap([1,2,3], [1,2,4,5]) AS similarity +// end::function[] + +// tag::create-sample-graph[] + +MERGE (fahrenheit451:Book {title:'Fahrenheit 451'}) +MERGE (dune:Book {title:'dune'}) +MERGE (hungerGames:Book {title:'The Hunger Games'}) +MERGE (nineteen84:Book {title:'1984'}) + +MERGE (scienceFiction:Genre {name: "Science Fiction"}) +MERGE (fantasy:Genre {name: "Fantasy"}) +MERGE (dystopia:Genre {name: "Dystopia"}) + +MERGE (fahrenheit451)-[:HAS_GENRE]->(dystopia) +MERGE (fahrenheit451)-[:HAS_GENRE]->(scienceFiction) +MERGE (fahrenheit451)-[:HAS_GENRE]->(fantasy) + +MERGE (hungerGames)-[:HAS_GENRE]->(scienceFiction) +MERGE (hungerGames)-[:HAS_GENRE]->(fantasy) +MERGE (hungerGames)-[:HAS_GENRE]->(romance) + +MERGE (nineteen84)-[:HAS_GENRE]->(scienceFiction) +MERGE (nineteen84)-[:HAS_GENRE]->(dystopia) + +MERGE (dune)-[:HAS_GENRE]->(scienceFiction) +MERGE (dune)-[:HAS_GENRE]->(fantasy) + +// end::create-sample-graph[] + +// tag::stream[] +MATCH (p:Person)-[:LIKES]->(cuisine) +WITH {item:id(p), categories: collect(id(cuisine))} as userData +WITH collect(userData) as data +CALL algo.similarity.jaccard.stream(data) +YIELD item1, item2, count1, count2, intersection, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity +ORDER BY similarity DESC +// 
end::stream[] + +// tag::stream-similarity-cutoff[] +MATCH (p:Person)-[:LIKES]->(cuisine) +WITH {item:id(p), categories: collect(id(cuisine))} as userData +WITH collect(userData) as data +CALL algo.similarity.jaccard.stream(data, {similarityCutoff: 0.0}) +YIELD item1, item2, count1, count2, intersection, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity +ORDER BY similarity DESC +// end::stream-similarity-cutoff[] + +// tag::stream-topk[] +MATCH (p:Person)-[:LIKES]->(cuisine) +WITH {item:id(p), categories: collect(id(cuisine))} as userData +WITH collect(userData) as data +CALL algo.similarity.jaccard.stream(data, {topK: 1, similarityCutoff: 0.0}) +YIELD item1, item2, count1, count2, intersection, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity +ORDER BY from +// end::stream-topk[] + +// tag::write-back[] +MATCH (p:Person)-[:LIKES]->(cuisine) +WITH {item:id(p), categories: collect(id(cuisine))} as userData +WITH collect(userData) as data +CALL algo.similarity.jaccard(data, {topK: 1, similarityCutoff: 0.1, write:true}) +YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100 +RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95 +// end::write-back[] + +// tag::query[] +MATCH (p:Person {name: "Praveena"})-[:SIMILAR]->(other), + (other)-[:LIKES]->(cuisine) +WHERE not((p)-[:LIKES]->(cuisine)) +RETURN cuisine.name AS cuisine +// end::query[] diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc new file mode 100644 index 000000000..187f80841 --- /dev/null +++ b/doc/asciidoc/similarity-overlap.adoc @@ -0,0 +1,248 @@ +[[algorithms-similarity-overlap]] += The Overlap Similarity algorithm + +[abstract] +-- +This section describes the Overlap Similarity algorithm in the Neo4j Graph Algorithms 
library. +-- + +// tag::introduction[] +link:https://en.wikipedia.org/wiki/Overlap_coefficient[Overlap similarity] measures overlap between two sets. +It is defined as the size of the intersection of two sets divided by the size of the smaller of the two sets. +// end::introduction[] + + +[[algorithms-similarity-overlap-context]] +== History and explanation + +// tag::explanation[] + +Overlap similarity is computed using the following formula: + +image::jaccard.png[role="middle"] + +// This is the raw information for this image: +// ``` +// O(A,B) = ∣A ∩ B∣ / min(∣A|,|B|) +// ``` + +The library contains both procedures and functions to calculate similarity between sets of data. +The function is best used when calculating the similarity between small numbers of sets. +The procedures parallelize the computation and are therefore a better bet when computing similarities on bigger datasets. + +// end::explanation[] + +[[algorithms-similarity-overlap-usecase]] +== Use-cases - when to use the Overlap Similarity algorithm + +// tag::use-case[] +We can use the Overlap Similarity algorithm to work out the similarity between two things. +We might then use the computed similarity as part of a recommendation query. +For example, you can use the Overlap Similarity algorithm to show the products that were purchased by similar customers, in terms of previous products purchased. +// end::use-case[] + + +[[algorithms-similarity-overlap-sample]] +== Overlap Similarity algorithm sample + +.The following will return the Overlap similarity of two lists of numbers: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=function] +---- + +// tag::function[] +.Results +[opts="header",cols="1"] +|=== +| `similarity` +| 0.66 +|=== +// end::function[] + +// tag::function-explanation[] +These two lists of numbers have an Overlap similarity of 0.66.
+We can see how this result is derived by breaking down the formula: + +``` +O(A,B) = ∣A ∩ B∣ / min(∣A∣,∣B∣) +O(A,B) = 2 / min(3,4) + = 2 / 3 + = 0.66 +``` +// end::function-explanation[] + +.The following will create a sample graph: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=create-sample-graph] +---- + +.The following will return a stream of node pairs along with their intersection and Overlap similarities: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=stream] +---- + +// tag::stream[] +.Results +[opts="header",cols="1,1,1,1"] +|=== +| From | To | Intersection | Similarity +| Arya | Karin | 2 | 0.66 +| Zhen | Michael | 2 | 0.66 +| Zhen | Praveena | 1 | 0.33 +| Michael | Karin | 1 | 0.25 +| Praveena | Michael | 1 | 0.25 +| Praveena | Arya | 1 | 0.25 +| Michael | Arya | 1 | 0.2 +| Praveena | Karin | 0 | 0 +| Zhen | Arya | 0 | 0 +| Zhen | Karin | 0 | 0 +|=== +// end::stream[] + +Arya and Karin, and Zhen and Michael have the most similar food preferences, with two overlapping cuisines for a similarity of 0.66. +We also have 3 pairs of users who are not similar at all. +We'd probably want to filter those out, which we can do by passing in the `similarityCutoff` parameter. + +.The following will return a stream of node pairs that have a similarity of at least 0.1, along with their intersection and Overlap similarities: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff] +---- + +// tag::stream-similarity-cutoff[] +.Results +[opts="header",cols="1,1,1,1"] +|=== +| `from` | `to` | `intersection` | `similarity` +| Arya | Karin | 2 | 0.66 +| Zhen | Michael | 2 | 0.66 +| Zhen | Praveena | 1 | 0.33 +| Michael | Karin | 1 | 0.25 +| Praveena | Michael | 1 | 0.25 +| Praveena | Arya | 1 | 0.25 +| Michael | Arya | 1 | 0.2 +|=== +// end::stream-similarity-cutoff[] + +We can see that those users with no similarity have been filtered out.
+If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` users for a given user. +We can do that by passing in the `topK` parameter. + +.The following will return a stream of users along with the most similar user to them (i.e. `k=1`): +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=stream-topk] +---- + +// tag::stream-topk[] +.Results +[opts="header",cols="1,1,1"] +|=== +| `from` | `to` | `similarity` +| Arya | Karin | 0.66 +| Karin | Arya | 0.66 +| Michael | Zhen | 0.66 +| Praveena | Zhen | 0.33 +| Zhen | Michael | 0.66 +|=== +// end::stream-topk[] + +These results will not be symmetrical. +For example, the person most similar to Praveena is Zhen, but the person most similar to Zhen is actually Michael. + +.Parameters +[opts="header",cols="1,1,1,1,4"] +|=== +| Name | Type | Default | Optional | Description +| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` +| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. +| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. +| `similarityCutoff` | double | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned. +| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. +| `concurrency` | int | available CPUs | yes | The number of concurrent threads. +|=== + +.Results +[opts="header",cols="1,1,6"] +|=== +| Name | Type | Description +| `item1` | int | The ID of one node in the similarity pair. +| `item2` | int | The ID of other node in the similarity pair. +| `count1` | int | The size of the `targets` list of one node. +| `count2` | int | The size of the `targets` list of other node.
+| `intersection` | int | The number of intersecting values in the two nodes `targets` lists. +| `similarity` | double | The Overlap similarity of the two nodes. +|=== + +.The following will find the most similar user for each user, and store a relationship between those users: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=write-back] +---- + +// tag::write-back[] +.Results +[opts="header",cols="1,1,1,1,1,1,1,1,1"] +|=== +| `nodes` | `similarityPairs` | `write` | `writeRelationshipType` | `writeProperty` | `min` | `max` | `mean` | `p95` +| 5 | 5 | true | SIMILAR | score | 0.33 | 0.66 | 0.59 | 0.66 +|=== +// end::write-back[] + +We then could write a query to find out what types of cuisine that other people similar to us might like. + +.The following will find the most similar user to Praveena, and return their favorite cuisines that Praveena doesn't (yet!) like: +[source, cypher] +---- +include::scripts/similarity-overlap.cypher[tag=query] +---- + +// tag::query[] +.Results +[opts="header",cols="1"] +|=== +| `cuisine` +| French +|=== +// end::query[] + +.Parameters +[opts="header",cols="1,1,1,1,4"] +|=== +| Name | Type | Default | Optional | Description +| `data` | list | null | no | A list of maps of the following structure: `{item: nodeId, categories: [nodeId, nodeId, nodeId]}` +| `top` | int | 0 | yes | The number of similar pairs to return. If `0`, it will return as many as it finds. +| `topK` | int | 0 | yes | The number of similar values to return per node. If `0`, it will return as many as it finds. +| `similarityCutoff` | double | -1 | yes | The threshold for Overlap similarity. Values below this will not be returned. +| `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. +| `concurrency` | int | available CPUs | yes | The number of concurrent threads.
+| `write` | boolean | false | yes | Indicates whether results should be stored. +| `writeRelationshipType` | string | SIMILAR | yes | The relationship type to use when storing results. +| `writeProperty` | string | score | yes | The property to use when storing results. +|=== + +.Results +[opts="header",cols="1,1,6"] +|=== +| Name | Type | Description +| `nodes` | int | The number of nodes passed in. +| `similarityPairs` | int | The number of pairs of similar nodes computed. +| `write` | boolean | Indicates whether results were stored. +| `writeRelationshipType` | string | The relationship type used when storing results. +| `writeProperty` | string | The property used when storing results. +| `min` | double | The minimum similarity score computed. +| `max` | double | The maximum similarity score computed. +| `mean` | double | The mean of similarities scores computed. +| `stdDev` | double | The standard deviation of similarities scores computed. +| `p25` | double | The 25 percentile of similarities scores computed. +| `p50` | double | The 50 percentile of similarities scores computed. +| `p75` | double | The 75 percentile of similarities scores computed. +| `p90` | double | The 90 percentile of similarities scores computed. +| `p95` | double | The 95 percentile of similarities scores computed. +| `p99` | double | The 99 percentile of similarities scores computed. +| `p999` | double | The 99.9 percentile of similarities scores computed. +| `p100` | double | The 100 percentile of similarities scores computed.
+|=== From 328eff5ce117ef686628d7ab8f62141a4d17a079 Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Mon, 1 Oct 2018 13:33:22 +0100 Subject: [PATCH 4/6] more examples --- .../scripts/similarity-overlap.cypher | 33 +++++++----- doc/asciidoc/similarity-overlap.adoc | 51 ++++++++----------- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher index 1d1bd5546..ef7ad4842 100644 --- a/doc/asciidoc/scripts/similarity-overlap.cypher +++ b/doc/asciidoc/scripts/similarity-overlap.cypher @@ -5,17 +5,20 @@ RETURN algo.similarity.overlap([1,2,3], [1,2,4,5]) AS similarity // tag::create-sample-graph[] MERGE (fahrenheit451:Book {title:'Fahrenheit 451'}) -MERGE (dune:Book {title:'dune'}) +MERGE (dune:Book {title:'Dune'}) MERGE (hungerGames:Book {title:'The Hunger Games'}) MERGE (nineteen84:Book {title:'1984'}) +MERGE (gatsby:Book {title:'The Great Gatsby'}) MERGE (scienceFiction:Genre {name: "Science Fiction"}) MERGE (fantasy:Genre {name: "Fantasy"}) MERGE (dystopia:Genre {name: "Dystopia"}) +MERGE (classics:Genre {name: "Classics"}) MERGE (fahrenheit451)-[:HAS_GENRE]->(dystopia) MERGE (fahrenheit451)-[:HAS_GENRE]->(scienceFiction) MERGE (fahrenheit451)-[:HAS_GENRE]->(fantasy) +MERGE (fahrenheit451)-[:HAS_GENRE]->(classics) MERGE (hungerGames)-[:HAS_GENRE]->(scienceFiction) MERGE (hungerGames)-[:HAS_GENRE]->(fantasy) @@ -23,37 +26,43 @@ MERGE (hungerGames)-[:HAS_GENRE]->(romance) MERGE (nineteen84)-[:HAS_GENRE]->(scienceFiction) MERGE (nineteen84)-[:HAS_GENRE]->(dystopia) +MERGE (nineteen84)-[:HAS_GENRE]->(classics) MERGE (dune)-[:HAS_GENRE]->(scienceFiction) MERGE (dune)-[:HAS_GENRE]->(fantasy) +MERGE (dune)-[:HAS_GENRE]->(classics) + +MERGE (gatsby)-[:HAS_GENRE]->(classics) // end::create-sample-graph[] // tag::stream[] -MATCH (p:Person)-[:LIKES]->(cuisine) -WITH {item:id(p), categories: collect(id(cuisine))} as userData +MATCH (book:Book)-[:HAS_GENRE]->(genre) +WITH 
{item:id(genre), categories: collect(id(book))} as userData WITH collect(userData) as data -CALL algo.similarity.jaccard.stream(data) +CALL algo.similarity.overlap.stream(data) YIELD item1, item2, count1, count2, intersection, similarity -RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, + count1, count2, intersection, similarity ORDER BY similarity DESC // end::stream[] // tag::stream-similarity-cutoff[] -MATCH (p:Person)-[:LIKES]->(cuisine) -WITH {item:id(p), categories: collect(id(cuisine))} as userData +MATCH (book:Book)-[:HAS_GENRE]->(genre) +WITH {item:id(genre), categories: collect(id(book))} as userData WITH collect(userData) as data -CALL algo.similarity.jaccard.stream(data, {similarityCutoff: 0.0}) +CALL algo.similarity.overlap.stream(data, {similarityCutoff: 0.75}) YIELD item1, item2, count1, count2, intersection, similarity -RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, intersection, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, + count1, count2, intersection, similarity ORDER BY similarity DESC // end::stream-similarity-cutoff[] // tag::stream-topk[] -MATCH (p:Person)-[:LIKES]->(cuisine) -WITH {item:id(p), categories: collect(id(cuisine))} as userData +MATCH (book:Book)-[:HAS_GENRE]->(genre) +WITH {item:id(genre), categories: collect(id(book))} as userData WITH collect(userData) as data -CALL algo.similarity.jaccard.stream(data, {topK: 1, similarityCutoff: 0.0}) +CALL algo.similarity.jaccard.stream(data, {topK: 1}) YIELD item1, item2, count1, count2, intersection, similarity RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity ORDER BY from diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc index 187f80841..72493cea1 100644 --- 
a/doc/asciidoc/similarity-overlap.adoc +++ b/doc/asciidoc/similarity-overlap.adoc @@ -86,27 +86,23 @@ include::scripts/similarity-overlap.cypher[tag=stream] // tag::stream[] .Results -[opts="header",cols="1,1,1,1"] -|=== -| From | To | Intersection | Similarity -| Arya | Karin | 2 | 0.66 -| Zhen | Michael | 2 | 0.66 -| Zhen | Praveena | 1 | 0.33 -| Michael | Karin | 1 | 0.25 -| Praveena | Michael | 1 | 0.25 -| Praveena | Arya | 1 | 0.25 -| Michael | Arya | 1 | 0.2 -| Praveena | Karin | 0 | 0 -| Zhen | Arya | 0 | 0 -| Zhen | Karin | 0 | 0 +[opts="header"] +|=== +| `from` | `to` | `count1` | `count2` | `intersection` | `similarity` +| Fantasy | Science Fiction | 3 | 4 | 3 | 1.0 +| Dystopia | Science Fiction | 2 | 4 | 2 | 1.0 +| Dystopia | Classics | 2 | 4 | 2 | 1.0 +| Science Fiction | Classics | 4 | 4 | 3 | 0.75 +| Fantasy | Classics | 3 | 4 | 2 | 0.66 +| Dystopia | Fantasy | 2 | 3 | 1 | 0.5 |=== // end::stream[] -Arya and Karin, and Zhen and Michael have the most similar food preferences, with two overlapping cuisines for a similarity of 0.66. -We also have 3 pairs of users who are not similar at all. -We'd probably want to filter those out, which we can do by passing in the `similarityCutoff` parameter. +Fantasy and Dystopia are both clear sub genres of Science Fiction - 100% of the books that list those as genres also list Science Fiction as a genre. +Dystopia is also a sub genre of Classics. +The others are less obvious - Dystopia probably isn't a sub genre of Fantasy, but the other two pairs could be sub genres.
-.The following will return a stream of node pairs that have a similarity of at least 0.1, along with their intersection and Overlap similarities: +.The following will return a stream of node pairs that have a similarity of at least 0.75, along with their intersection and Overlap similarities: [source, cypher] ---- include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff] @@ -114,24 +110,21 @@ include::scripts/similarity-overlap.cypher[tag=stream-similarity-cutoff] // tag::stream-similarity-cutoff[] .Results -[opts="header",cols="1,1,1,1"] +[opts="header"] |=== -| `from` | `to` | `intersection` | `similarity` -| Arya | Karin | 2 | 0.66 -| Zhen | Michael | 2 | 0.66 -| Zhen | Praveena | 1 | 0.33 -| Michael | Karin | 1 | 0.25 -| Praveena | Michael | 1 | 0.25 -| Praveena | Arya | 1 | 0.25 -| Michael | Arya | 1 | 0.2 +| `from` | `to` | `count1` | `count2` | `intersection` | `similarity` +| Fantasy | Science Fiction | 3 | 4 | 3 | 1.0 +| Dystopia | Science Fiction | 2 | 4 | 2 | 1.0 +| Dystopia | Classics | 2 | 4 | 2 | 1.0 +| Science Fiction | Classics | 4 | 4 | 3 | 0.75 |=== // end::stream-similarity-cutoff[] -We can see that those users with no similarity have been filtered out. -If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` users for a given user. +We can see that those genres with lower similarity have been filtered out. +If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` super genres for a given genre. We can do that by passing in the `topK` parameter. -.The following will return a stream of users along with the most similar user to them (i.e. `k=1`): +.The following will return a stream of genres along with the most similar super category to them (i.e. 
`k=1`): [source, cypher] ---- include::scripts/similarity-overlap.cypher[tag=stream-topk] From 88d1d3c04f8d363858000aff952ae5f1817cb394 Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Mon, 1 Oct 2018 13:53:59 +0100 Subject: [PATCH 5/6] more examples --- .../scripts/similarity-overlap.cypher | 18 +++++----- doc/asciidoc/similarity-overlap.adoc | 34 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/doc/asciidoc/scripts/similarity-overlap.cypher b/doc/asciidoc/scripts/similarity-overlap.cypher index ef7ad4842..52cf0ed3d 100644 --- a/doc/asciidoc/scripts/similarity-overlap.cypher +++ b/doc/asciidoc/scripts/similarity-overlap.cypher @@ -62,24 +62,24 @@ ORDER BY similarity DESC MATCH (book:Book)-[:HAS_GENRE]->(genre) WITH {item:id(genre), categories: collect(id(book))} as userData WITH collect(userData) as data -CALL algo.similarity.jaccard.stream(data, {topK: 1}) +CALL algo.similarity.overlap.stream(data, {topK: 2}) YIELD item1, item2, count1, count2, intersection, similarity -RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, similarity +RETURN algo.getNodeById(item1).name AS from, algo.getNodeById(item2).name AS to, + count1, count2, intersection, similarity ORDER BY from // end::stream-topk[] // tag::write-back[] -MATCH (p:Person)-[:LIKES]->(cuisine) -WITH {item:id(p), categories: collect(id(cuisine))} as userData +MATCH (book:Book)-[:HAS_GENRE]->(genre) +WITH {item:id(genre), categories: collect(id(book))} as userData WITH collect(userData) as data -CALL algo.similarity.jaccard(data, {topK: 1, similarityCutoff: 0.1, write:true}) +CALL algo.similarity.overlap(data, {topK: 2, similarityCutoff: 0.5, write:true}) YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100 RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95 // end::write-back[] // tag::query[] -MATCH (p:Person {name: 
"Praveena"})-[:SIMILAR]->(other), - (other)-[:LIKES]->(cuisine) -WHERE not((p)-[:LIKES]->(cuisine)) -RETURN cuisine.name AS cuisine +MATCH path = (fantasy:Genre {name: "Fantasy"})-[:NARROWER_THAN*]->(genre) +RETURN [node in nodes(path) | node.name] AS hierarchy +ORDER BY length(path) // end::query[] diff --git a/doc/asciidoc/similarity-overlap.adoc b/doc/asciidoc/similarity-overlap.adoc index 72493cea1..ac6eb5cf6 100644 --- a/doc/asciidoc/similarity-overlap.adoc +++ b/doc/asciidoc/similarity-overlap.adoc @@ -124,7 +124,7 @@ We can see that those genres with lower similarity have been filtered out. If we're implementing a k-Nearest Neighbors type query we might instead want to find the most similar `k` super genres for a given genre. We can do that by passing in the `topK` parameter. -.The following will return a stream of genres along with the most similar super category to them (i.e. `k=1`): +.The following will return a stream of genres along with the two most similar super genres to them (i.e. `k=2`): [source, cypher] ---- include::scripts/similarity-overlap.cypher[tag=stream-topk] @@ -132,19 +132,17 @@ include::scripts/similarity-overlap.cypher[tag=stream-topk] // tag::stream-topk[] .Results -[opts="header",cols="1,1,1"] +[opts="header"] |=== -| `from` | `to` | `similarity` -| Arya | Karin | 0.66 -| Karin | Arya | 0.66 -| Michael | Zhen | 0.66 -| Praveena | Zhen | 0.33 -| Zhen | Michael | 0.66 +| `from` | `to` | `count1` | `count2` | `intersection` | `similarity` +| Dystopia | Classics | 2 | 4 | 2 | 1.0 +| Dystopia | Science Fiction | 2 | 4 | 2 | 1.0 +| Fantasy | Science Fiction | 3 | 4 | 3 | 1.0 +| Fantasy | Classics | 3 | 4 | 2 | 0.6666666666666666 +| Science Fiction | Classics | 4 | 4 | 3 | 0.75 |=== // end::stream-topk[] -These results will not be symmetrical. -For example, the person most similar to Praveena is Zhen, but the person most similar to Zhen is actually Michael. 
.Parameters [opts="header",cols="1,1,1,1,4"] @@ -178,16 +176,16 @@ include::scripts/similarity-overlap.cypher[tag=write-back] // tag::write-back[] .Results -[opts="header",cols="1,1,1,1,1,1,1,1,1"] +[opts="header"] |=== | `nodes` | `similarityPairs` | `write` | `writeRelationshipType` | `writeProperty` | `min` | `max` | `mean` | `p95` -| 5 | 5 | true | SIMILAR | score | 0.33 | 0.66 | 0.59 | 0.66 +| 4 | 5 | TRUE | NARROWER_THAN | score | 0.6666641235351562 | 1.0000038146972656 | 0.8833351135253906 | 1.0000038146972656 |=== // end::write-back[] -We then could write a query to find out what types of cuisine that other people similar to us might like. +We then could write a query to find out the genre hierarchy for a specific genre. -.The following will find the most similar user to Praveena, and return their favorite cuisines that Praveena doesn't (yet!) like: +.The following will find the genre hierarchy for the Fantasy genre [source, cypher] ---- include::scripts/similarity-overlap.cypher[tag=query] @@ -197,8 +195,10 @@ include::scripts/similarity-overlap.cypher[tag=query] .Results [opts="header",cols="1"] |=== -| `cuisine` -| French +| `hierarchy` +| ["Fantasy", "Science Fiction"] +| ["Fantasy", "Classics"] +| ["Fantasy", "Science Fiction", "Classics"] |=== // end::query[] @@ -213,7 +213,7 @@ include::scripts/similarity-overlap.cypher[tag=query] | `degreeCutoff` | int | 0 | yes | The threshold for the number of items in the `targets` list. If the list contains less than this amount, that node will be excluded from the calculation. | `concurrency` | int | available CPUs | yes | The number of concurrent threads. | `write` | boolean | false | yes | Indicates whether results should be stored. -| `writeRelationshipType` | string | SIMILAR | yes | The relationship type to use when storing results. +| `writeRelationshipType` | string | NARROWER_THAN | yes | The relationship type to use when storing results. 
| `writeProperty` | string | score | yes | The property to use when storing results. |=== From 15a5c91933ebace154590ef6a9758d2c9131f4e2 Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Tue, 2 Oct 2018 11:53:31 +0100 Subject: [PATCH 6/6] link overlap similarity --- doc/asciidoc/algorithms-similarity.adoc | 2 ++ doc/docbook/content-map.xml | 4 +++- readme.adoc | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/asciidoc/algorithms-similarity.adoc b/doc/asciidoc/algorithms-similarity.adoc index 842f7a1f5..dd6b1a996 100644 --- a/doc/asciidoc/algorithms-similarity.adoc +++ b/doc/asciidoc/algorithms-similarity.adoc @@ -13,7 +13,9 @@ These algorithms help calculate the similarity of nodes: * <> (`algo.similarity.jaccard`) * <> (`algo.similarity.cosine`) * <> (`algo.similarity.euclidean`) +* <> (`algo.similarity.overlap`) include::similarity-jaccard.adoc[leveloffset=2] include::similarity-cosine.adoc[leveloffset=2] include::similarity-euclidean.adoc[leveloffset=2] +include::similarity-overlap.adoc[leveloffset=2] diff --git a/doc/docbook/content-map.xml b/doc/docbook/content-map.xml index 4e97ba77e..20205484b 100644 --- a/doc/docbook/content-map.xml +++ b/doc/docbook/content-map.xml @@ -56,13 +56,15 @@ + + - + diff --git a/readme.adoc b/readme.adoc index 7046336f9..fbf803fe6 100644 --- a/readme.adoc +++ b/readme.adoc @@ -69,6 +69,7 @@ These algorithms help calculate the similarity of nodes: * <> (`algo.similarity.jaccard`) * <> (`algo.similarity.cosine`) * <> (`algo.similarity.euclidean`) +* <> (`algo.similarity.overlap`) === Preprocessing