From 9668e4a682332251e9b4f20184ce684c72e502f8 Mon Sep 17 00:00:00 2001 From: mamonu Date: Wed, 10 Nov 2021 14:07:32 +0000 Subject: [PATCH 1/3] taking py jc out for the moment --- splink/jar_fallback.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/splink/jar_fallback.py b/splink/jar_fallback.py index 0957da193a..ec22f00f65 100644 --- a/splink/jar_fallback.py +++ b/splink/jar_fallback.py @@ -1,27 +1,6 @@ import math -def jc_sim_py(str1, str2): - """ - Jaccard`similarity calculated exactly as in stringutils.similarity jaccard in Apache Commons - """ - - if not str1 or not str2: - return 0.0 - - k = 2 # default k in stringutil is 2 so leaving it like that for compatibility - - # break strings into sets of rolling k-char syllables - a = set([str1[i : i + 1] for i in range(len(str1) - k + 1)]) - b = set([str2[i : i + 1] for i in range(len(str2) - k + 1)]) - - # calculate instersection of two sets - c = a.intersection(b) - - # return Jaccard similarity - return float(len(c)) / (len(a) + len(b) - len(c)) - - def jw_sim_py( first, second, # modification from original to not use other imput parameters From c8e2709f9fed2f7de7a51a8ea8f9713635103a54 Mon Sep 17 00:00:00 2001 From: mamonu Date: Wed, 10 Nov 2021 14:10:09 +0000 Subject: [PATCH 2/3] taking py jc tests out for the moment --- tests/test_jar_fallback.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_jar_fallback.py b/tests/test_jar_fallback.py index 048ef94ed6..57a8d5a9ad 100644 --- a/tests/test_jar_fallback.py +++ b/tests/test_jar_fallback.py @@ -8,13 +8,12 @@ def test_fallback_jw_nodata(): assert jw_sim_py(None, "Something") == 0.0 -def test_fallback_jc_nodata(): - assert jc_sim_py(None, None) == 0.0 - assert jc_sim_py("something", None) == 0.0 - assert jc_sim_py(None, "Something") == 0.0 - - def test_fallback_jw_wikipedia_examples(): + """ + tests from Apache Commons jarowinkler similarity available at: + https://github.com/apache/commons-text/blob/master/src/test/java/org/apache/commons/text/similarity/JaroWinklerSimilarityTest.java + + """ assert jw_sim_py("fly", "ant") == 0.0 assert jw_sim_py("elephant", "hippo") == 0.44 assert jw_sim_py("ABC Corporation", "ABC Corp") == 0.91 From 89d46e88f255be75fe744b43619dd2a777e54fcd Mon Sep 17 00:00:00 2001 From: mamonu Date: Wed, 10 Nov 2021 14:15:04 +0000 Subject: [PATCH 3/3] taking py jc tests out for the moment --- tests/test_jar_fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_jar_fallback.py b/tests/test_jar_fallback.py index 57a8d5a9ad..d1d281e421 100644 --- a/tests/test_jar_fallback.py +++ b/tests/test_jar_fallback.py @@ -1,5 +1,5 @@ import pytest -from splink.jar_fallback import jw_sim_py, jc_sim_py +from splink.jar_fallback import jw_sim_py def test_fallback_jw_nodata():