diff --git a/similarity/cosine.py b/similarity/cosine.py index 799e598..64f45e0 100644 --- a/similarity/cosine.py +++ b/similarity/cosine.py @@ -24,6 +24,9 @@ from .string_distance import NormalizedStringDistance from .string_similarity import NormalizedStringSimilarity +import re +_SPACE_PATTERN = re.compile("\\s+") + class Cosine(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity): @@ -41,6 +44,10 @@ def similarity(self, s0, s1): raise TypeError("Argument s1 is NoneType.") if s0 == s1: return 1.0 + + s0 = _SPACE_PATTERN.sub("", s0) + s1 = _SPACE_PATTERN.sub("", s1) + if len(s0) < self.get_k() or len(s1) < self.get_k(): return 0.0 profile0 = self.get_profile(s0) diff --git a/similarity/jaccard.py b/similarity/jaccard.py index a7db3c6..ad6d832 100644 --- a/similarity/jaccard.py +++ b/similarity/jaccard.py @@ -22,6 +22,8 @@ from .string_distance import NormalizedStringDistance, MetricStringDistance from .string_similarity import NormalizedStringSimilarity +import re +_SPACE_PATTERN = re.compile("\\s+") class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity): @@ -38,6 +40,10 @@ def similarity(self, s0, s1): raise TypeError("Argument s1 is NoneType.") if s0 == s1: return 1.0 + + s0 = _SPACE_PATTERN.sub("", s0) + s1 = _SPACE_PATTERN.sub("", s1) + if len(s0) < self.get_k() or len(s1) < self.get_k(): return 0.0 profile0 = self.get_profile(s0)