Skip to content

Commit

Permalink
Merge 89d46e8 into 6cec2cc
Browse files Browse the repository at this point in the history
  • Loading branch information
mamonu committed Nov 10, 2021
2 parents 6cec2cc + 89d46e8 commit d355540
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 28 deletions.
21 changes: 0 additions & 21 deletions splink/jar_fallback.py
@@ -1,27 +1,6 @@
import math


def jc_sim_py(str1, str2):
    """
    Jaccard similarity of two strings over their sets of k-character shingles.

    Pure-Python fallback intended to mirror the Jaccard similarity of the
    Java string-similarity utilities (k = 2), i.e. |A & B| / |A | B| where
    A and B are the sets of rolling k-character substrings of each input.

    Args:
        str1: first string; ``None`` or ``""`` is treated as missing data.
        str2: second string; ``None`` or ``""`` is treated as missing data.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when either input is
        missing or empty.
    """

    if not str1 or not str2:
        return 0.0

    k = 2  # default k in stringutil is 2 so leaving it like that for compatibility

    def shingles(text):
        # Rolling windows of exactly k characters. A string shorter than k has
        # no full window, so it contributes itself as its only shingle; this
        # keeps both sets non-empty and the denominator below non-zero.
        return {text[i : i + k] for i in range(max(len(text) - k + 1, 1))}

    a = shingles(str1)
    b = shingles(str2)

    # shingles present in both strings
    c = a & b

    # |A | B| = |A| + |B| - |A & B|
    return float(len(c)) / (len(a) + len(b) - len(c))


def jw_sim_py(
first,
second, # modification from original to not use other imput parameters
Expand Down
13 changes: 6 additions & 7 deletions tests/test_jar_fallback.py
@@ -1,5 +1,5 @@
import pytest
from splink.jar_fallback import jw_sim_py, jc_sim_py
from splink.jar_fallback import jw_sim_py


def test_fallback_jw_nodata():
Expand All @@ -8,13 +8,12 @@ def test_fallback_jw_nodata():
assert jw_sim_py(None, "Something") == 0.0


def test_fallback_jc_nodata():
    """jc_sim_py must score 0.0 whenever at least one input is None."""
    missing_pairs = (
        (None, None),
        ("something", None),
        (None, "Something"),
    )
    for left, right in missing_pairs:
        assert jc_sim_py(left, right) == 0.0


def test_fallback_jw_wikipedia_examples():
"""
tests from Apache Commons jarowinkler similarity available at:
https://github.com/apache/commons-text/blob/master/src/test/java/org/apache/commons/text/similarity/JaroWinklerSimilarityTest.java
"""
assert jw_sim_py("fly", "ant") == 0.0
assert jw_sim_py("elephant", "hippo") == 0.44
assert jw_sim_py("ABC Corporation", "ABC Corp") == 0.91
Expand Down

0 comments on commit d355540

Please sign in to comment.