Skip to content

Commit

Permalink
Fixes #864: Adds text similarity/distance methods and double metaphon…
Browse files Browse the repository at this point in the history
…e text encoding.

- Added Apache commons-text dependency

- Added Levenshtein Similarity code and test

- Added Hamming Distance code and test

- Added Jaro-Winkler Distance code and test

- Added Double Metaphone text encoding and test
  • Loading branch information
nammmm committed Jul 19, 2018
1 parent aba1611 commit d69f15e
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 4 deletions.
1 change: 1 addition & 0 deletions build.gradle
Expand Up @@ -127,6 +127,7 @@ dependencies {
compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'

compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"

Expand Down
14 changes: 14 additions & 0 deletions src/main/java/apoc/text/Phonetic.java
@@ -1,5 +1,6 @@
package apoc.text;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.neo4j.procedure.Description;
import apoc.result.LongResult;
import apoc.result.StringResult;
Expand All @@ -18,6 +19,8 @@

public class Phonetic {

// Single encoder instance used by the doubleMetaphone procedure in this class.
private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone();

@Procedure
@Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> phonetic(final @Name("value") Object value) {
Expand All @@ -38,6 +41,17 @@ public Stream<PhoneticResult> phoneticDelta(final @Name("text1") String text1, f
}
}

/**
 * Computes the Double Metaphone encoding of every word of the given text.
 *
 * <p>{@code value} may be a single value or an {@link Iterable} of values; one result row is
 * emitted per element. Each element is converted with {@code toString()}, trimmed, split on
 * runs of non-word characters ({@code \W+}), and the codes of the individual words are
 * concatenated without a separator. {@code null} elements and empty strings yield
 * {@link StringResult#EMPTY}; text without any encodable word yields an empty string.
 *
 * @param value a string, or a list of strings, to encode
 * @return a stream holding one {@link StringResult} per input element
 */
@Procedure
@Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> doubleMetaphone(final @Name("value") Object value) {
    final Stream<?> values = value instanceof Iterable
            ? StreamSupport.stream(((Iterable<?>) value).spliterator(), false)
            : Stream.of(value);

    return values.map(item -> {
        if (item == null || item.toString().isEmpty()) {
            return StringResult.EMPTY;
        }
        final String encoded = Stream.of(item.toString().trim().split("\\W+"))
                // split() produces an empty token when the text starts with a non-word
                // character (e.g. ",hello") or is whitespace-only; skip those tokens.
                .filter(word -> !word.isEmpty())
                .map(DOUBLE_METAPHONE::doubleMetaphone)
                // Defensive: never let a null code be concatenated as the literal "null".
                .filter(code -> code != null)
                .reduce("", String::concat);
        return new StringResult(encoded);
    });
}

public static class PhoneticResult {
public final String phonetic1, phonetic2;
public final long delta;
Expand Down
44 changes: 42 additions & 2 deletions src/main/java/apoc/text/Strings.java
@@ -1,6 +1,9 @@
package apoc.text;

import apoc.util.Util;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.helpers.collection.Pair;
Expand Down Expand Up @@ -36,6 +39,10 @@
*/
public class Strings {

// Shared algorithm instances backing the apoc.text distance/similarity functions below.
// Hamming distance: used by hammingDistance(text1, text2).
private static final HammingDistance hammingDistance = new HammingDistance();
// Jaro-Winkler: used by jaroWinklerDistance(text1, text2).
private static final JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
// Levenshtein edit distance: used by distance(text1, text2).
private static final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@UserFunction
@Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
Expand Down Expand Up @@ -126,12 +133,45 @@ public boolean compareCleaned(final @Name("text1") String text1, final @Name("te
}

// Returns the Levenshtein edit distance between text1 and text2 as a Long,
// or null when either argument is null.
// NOTE(review): this span is rendered from a diff whose +/- markers were lost. The two
// @Description lines and the two return statements look like the before/after versions
// of the same line (the file header reports 2 deletions) - confirm against the real file.
@UserFunction
@Description("apoc.text.distance(text1, text2) - compare the given strings with the StringUtils.distance(text1, text2) method")
@Description("apoc.text.distance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
public Long distance(final @Name("text1") String text1, @Name("text2")final String text2) {
if (text1 == null || text2 == null) {
return null;
}
// Presumably the pre-change implementation (StringUtils based).
return (long) StringUtils.getLevenshteinDistance(text1, text2);
// Presumably the post-change implementation (shared LevenshteinDistance instance).
return (long)levenshteinDistance.apply(text1, text2);
}

/**
 * Computes a normalized similarity from the Levenshtein edit distance:
 * {@code (longerLength - editDistance) / longerLength}.
 *
 * <p>The description names the function {@code apoc.text.similarity}, matching the method
 * name (and therefore the name it is called by), instead of the previously advertised
 * {@code apoc.text.levenshteinSimilarity}.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return {@code null} if either text is {@code null}; {@code 1.0} if both are empty;
 *         otherwise the ratio described above
 */
@UserFunction
@Description( "apoc.text.similarity(text1, text2) - calculate the similarity (a value within 0 and 1) between two texts." )
public Double similarity(final @Name("text1") String text1, @Name("text2")final String text2) {
    if (text1 == null || text2 == null) {
        return null;
    }

    final int longerLength = Math.max(text1.length(), text2.length());
    if (longerLength == 0) {
        // Two empty strings are identical; also avoids dividing by zero below.
        return 1.0;
    }
    // distance() only returns null for null input, which was ruled out above.
    final long editDistance = distance(text1, text2);
    return (longerLength - editDistance) / (double) longerLength;
}

/**
 * Hamming distance between two texts, delegating to the shared {@code hammingDistance}
 * algorithm instance.
 *
 * <p>NOTE(review): the underlying commons-text implementation presumably rejects inputs of
 * different length with an exception - confirm and consider documenting it for callers.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return the distance as a {@code Long}, or {@code null} if either text is {@code null}
 */
@UserFunction
@Description( "apoc.text.hammingDistance(text1, text2) - compare the given strings with the Hamming distance algorithm." )
public Long hammingDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final int differingPositions = hammingDistance.apply(text1, text2);
    return (long) differingPositions;
}

/**
 * Jaro-Winkler comparison of two texts, delegating to the shared
 * {@code jaroWinklerDistance} algorithm instance.
 *
 * <p>NOTE(review): the accompanying test expects roughly 0.7777 for "Neo" vs "Leo", which
 * reads like a similarity score (higher means more alike) rather than a distance - confirm
 * against the commons-text version in use.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return the algorithm's result, or {@code null} if either text is {@code null}
 */
@UserFunction
@Description( "apoc.text.jaroWinklerDistance(text1, text2) - compare the given strings with the Jaro-Winkler distance algorithm." )
public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
    if (text1 != null && text2 != null) {
        return jaroWinklerDistance.apply(text1, text2);
    }
    return null;
}

@UserFunction
Expand Down
28 changes: 28 additions & 0 deletions src/test/java/apoc/text/PhoneticTest.java
Expand Up @@ -75,4 +75,32 @@ public void shouldComputeSoundexDifference() {
assertThat(row.get("delta"), equalTo(4L))
);
}

/** A single word ('Apoc') is expected to encode to "APK". */
@Test
public void shoudlComputeDoubleMetaphone() {
    final String query = "CALL apoc.text.doubleMetaphone('Apoc')";
    testCall(db, query, row -> {
        assertThat(row.get("value"), equalTo("APK"));
    });
}

/** A NULL argument is expected to produce a row whose value is null. */
@Test
public void shoudlComputeDoubleMetaphoneOfNull() {
    final String query = "CALL apoc.text.doubleMetaphone(NULL)";
    testCall(db, query, row -> {
        assertThat(row.get("value"), equalTo(null));
    });
}

/** The empty string is expected to produce a row whose value is null. */
@Test
public void shoudlComputeDoubleMetaphoneForTheEmptyString() {
    final String query = "CALL apoc.text.doubleMetaphone('')";
    testCall(db, query, row -> {
        assertThat(row.get("value"), equalTo(null));
    });
}

/** Several words separated by punctuation are encoded word by word and concatenated. */
@Test
public void shouldComputeDoubleMetaphoneOfManyWords() {
    final String query = "CALL apoc.text.doubleMetaphone('Hello, dear User!')";
    testCall(db, query, row -> {
        assertThat(row.get("value"), equalTo("HLTRASR"));
    });
}
}
34 changes: 32 additions & 2 deletions src/test/java/apoc/text/StringsTest.java
Expand Up @@ -190,15 +190,45 @@ public void testCompareCleanedInQuery() throws Exception {
}

// Checks that apoc.text.distance reports an edit distance of 1 between
// "Levenshtein" and "Levenstein" (one missing character).
// NOTE(review): this span is rendered from a diff whose +/- markers were lost. The two
// method signatures and the two testCall lines look like the before/after versions of
// the same line - confirm against the real file.
@Test
public void testGetLevenshteinDistance() {
public void testLevenshteinDistance() {
String text1 = "Levenshtein";
String text2 = "Levenstein";

testCall(db, "RETURN apoc.text.distance({a}, {b}) as distance",
testCall(db, "RETURN apoc.text.distance({a}, {b}) AS distance",
map("a", text1, "b", text2),
row -> assertEquals(1L, row.get("distance")));
}

/** "Levenshtein" vs "Levenstein" is expected to have a similarity of about 0.9. */
@Test
public void testLevenshteinSimilarity() {
    final String query = "RETURN apoc.text.similarity({a}, {b}) AS similarity";

    testCall(db, query, map("a", "Levenshtein", "b", "Levenstein"), row -> {
        final double actual = (double) row.get("similarity");
        assertEquals(0.9, actual, 0.01);
    });
}

/** "Neo" and "Leo" differ in exactly one position, so the expected distance is 1. */
@Test
public void testHammingDistance() {
    final String query = "RETURN apoc.text.hammingDistance({a}, {b}) AS distance";

    testCall(db, query, map("a", "Neo", "b", "Leo"), row -> {
        final Object actual = row.get("distance");
        assertEquals(1L, actual);
    });
}

/** "Neo" vs "Leo" is expected to yield about 0.7777 from apoc.text.jaroWinklerDistance. */
@Test
public void testJaroWinklerDistance() {
    final String query = "RETURN apoc.text.jaroWinklerDistance({a}, {b}) AS distance";

    testCall(db, query, map("a", "Neo", "b", "Leo"), row -> {
        final double actual = (double) row.get("distance");
        assertEquals(0.7777, actual, 0.0001);
    });
}

@Test
public void testFuzzyMatch() {
Strings strings = new Strings();
Expand Down

0 comments on commit d69f15e

Please sign in to comment.