From a33f837b4269350205eaf087b16e71325b7b0514 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 25 Oct 2020 09:57:33 +0100 Subject: [PATCH 01/21] fix tests for short strings (shorter than q in QGramDistances) --- test/distances.jl | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/test/distances.jl b/test/distances.jl index 42bafae..2c9dae7 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -154,28 +154,23 @@ using StringDistances, Unicode, Test, Random @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)] end - function partlyoverlappingstrings(sizerange, chars = []) - str1 = if length(chars) < 1 - randstring(rand(sizerange)) - else - randstring(chars, rand(sizerange)) - end - elems = collect(str1) - ci1 = prevind(str1, rand(2:div(length(elems), 2))) - ci2 = prevind(str1, rand((ci1+1):(length(elems)-1))) - str2 = if length(chars) < 1 - randstring(ci1-1) * join(elems[ci1:ci2]) * randstring(length(str1)-ci2) - else - randstring(chars, ci1-1) * join(elems[ci1:ci2]) * randstring(chars, length(str1)-ci2) - end - return str1, str2 + function partlyoverlappingstrings(sizerange, chars = nothing) + l = rand(sizerange) + str1 = isnothing(chars) ? randstring(l) : randstring(chars, l) + ci1 = thisind(str1, rand(1:l)) + ci2 = thisind(str1, rand(ci1:l)) + copied = join(str1[ci1:ci2]) + prefix = isnothing(chars) ? randstring(ci1-1) : randstring(chars, ci1-1) + slen = l - length(copied) - length(prefix) + suffix = isnothing(chars) ? randstring(slen) : randstring(chars, slen) + return str1, (prefix * copied * suffix) end @testset "Precalculation on unicode strings" begin Chars = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...) 
for _ in 1:100 - str1, str2 = partlyoverlappingstrings(10:100, Chars) qlen = rand(2:5) + str1, str2 = partlyoverlappingstrings(6:100, Chars) d = Jaccard(qlen) qd1 = QGramDict(str1, qlen) @@ -196,12 +191,25 @@ using StringDistances, Unicode, Test, Random end end + @testset "QGram distance on short strings" begin + @test isnan(evaluate(Overlap(2), "1", "2")) + @test isnan(evaluate(Jaccard(3), "s1", "s2")) + @test isnan(evaluate(Cosine(5), "s1", "s2")) + + @test !isnan(evaluate(Overlap(2), "s1", "s2")) + @test !isnan(evaluate(Jaccard(3), "st1", "st2")) + @test !isnan(evaluate(Cosine(5), "stri1", "stri2")) + + @test !isnan(evaluate(Jaccard(3), "st1", "str2")) + @test !isnan(evaluate(Jaccard(3), "str1", "st2")) + end + @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap] for _ in 1:100 qlen = rand(2:9) dist = D(qlen) - str1, str2 = partlyoverlappingstrings(5:10000) + str1, str2 = partlyoverlappingstrings(10:10000) # QGramDict gets same result as for standard string qd1 = QGramDict(str1, qlen) From eb6617bae12ea90270b014919bd692ed7f370bab Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Mon, 26 Oct 2020 16:19:27 +0100 Subject: [PATCH 02/21] started adding pairwise with tests --- src/pairwise.jl | 38 ++++++++++++++++++++++++++++++++++++++ test/pairwise.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 src/pairwise.jl create mode 100644 test/pairwise.jl diff --git a/src/pairwise.jl b/src/pairwise.jl new file mode 100644 index 0000000..f7b1fed --- /dev/null +++ b/src/pairwise.jl @@ -0,0 +1,38 @@ +_allocmat(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) +_allocmat(X, T) = Matrix{T}(undef, length(X), length(X)) + +pairwise(dist::StringDistance, X, Y; eltype = Float64) = + pairwise!(_allocmat(X, Y, eltype), dist, X, Y) + +pairwise(dist::StringDistance, X; eltype = Float64) = + pairwise!(_allocmat(X, eltype), dist, X) + +function 
pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X) where {N<:Number} + if dist isa SemiMetric + _symmetric_pairwise!(R, dist, X) + else + _asymmetric_pairwise!(R, dist, X, X) + end +end + +function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y) where {N<:Number} + _asymmetric_pairwise!(R, dist, X, Y) +end + +_precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)] + +const PrecalcMinLength = 5 # Only precalc if length >= 5 + +function _symmetric_pairwise!(R, dist::QGramDistance, X; precalc = nothing, precalcType = QGramSortedVector) + # precalc if set to true or if isnothing and length is at least min length + shouldprecalc = (precalc === true) | (isnothing(precalc) & length(X) >= PrecalcMinLength) + objs = shouldprecalc ? _precalc(X, precalcType, q(dist)) : X + + for i in 1:length(objs) + R[i, i] = 0 + for j in (i+1):length(objs) + R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j]) + end + end + return R +end diff --git a/test/pairwise.jl b/test/pairwise.jl new file mode 100644 index 0000000..68f419e --- /dev/null +++ b/test/pairwise.jl @@ -0,0 +1,42 @@ +using StringDistances, Unicode, Test, Random + +@testset "pairwise" begin + +TestStrings = ["", "abc", "bc", "kitten"] + +@testset "pairwise" begin + for DT in [Levenshtein, Jaro] + d = DT() + R = pairwise(d, TestStrings) + + @test R isa Matrix{Float64, 2} + @test size(R) == (4, 4) + + # No distance on the diagonal, since comparing strings to themselves + @test R[1, 1] == 0.0 + @test R[2, 2] == 0.0 + @test R[3, 3] == 0.0 + @test R[4, 4] == 0.0 + + # First row is comparing "" to the other strings, so: + @test R[1, 2] == evaluate(d, "", "abc") + @test R[1, 3] == evaluate(d, "", "bc") + @test R[1, 4] == evaluate(d, "", "kitten") + + # Second row is comparing "abc" to the other strings, so: + @test R[2, 3] == evaluate(d, "abc", "bc") + @test R[2, 4] == evaluate(d, "abc", "kitten") + + # Third row row is comparing "bc" to the other strings, so: + @test R[3, 4] == evaluate(d, "bc", "kitten") + + # 
Matrix is symmetric + for i in 1:4 + for j in (i+1):4 + @test R[i, j] == R[j, i] + end + end + end +end + +end From ff1daea00112e006b495f7ec2bfaec7e7abe562a Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 7 Nov 2020 12:23:10 +0100 Subject: [PATCH 03/21] added pairwise for calculating distance matrices --- src/StringDistances.jl | 5 +-- src/pairwise.jl | 61 +++++++++++++++++++++++------------- test/pairwise.jl | 70 ++++++++++++++++++++++++++++++++++-------- test/runtests.jl | 1 + 4 files changed, 102 insertions(+), 35 deletions(-) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index dc4a215..4da9978 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -11,7 +11,7 @@ const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffOber # Distances API Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", "")) include("find.jl") - +include("pairwise.jl") ############################################################################## ## @@ -42,6 +42,7 @@ compare, result_type, qgrams, normalize, -findnearest +findnearest, +pairwise end diff --git a/src/pairwise.jl b/src/pairwise.jl index f7b1fed..a974fa5 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,38 +1,57 @@ -_allocmat(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) -_allocmat(X, T) = Matrix{T}(undef, length(X), length(X)) +_allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) +_allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) -pairwise(dist::StringDistance, X, Y; eltype = Float64) = - pairwise!(_allocmat(X, Y, eltype), dist, X, Y) +pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) = + pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc) -pairwise(dist::StringDistance, X; eltype = Float64) = - pairwise!(_allocmat(X, eltype), dist, X) +pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) = + pairwise!(_allocmatrix(X, eltype), dist, X; precalc) -function 
pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X) where {N<:Number} - if dist isa SemiMetric - _symmetric_pairwise!(R, dist, X) - else - _asymmetric_pairwise!(R, dist, X, X) - end -end +pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} = + (dist isa SemiMetric) ? + _symmetric_pairwise!(R, dist, X; precalc) : + _asymmetric_pairwise!(R, dist, X, X; precalc) -function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y) where {N<:Number} - _asymmetric_pairwise!(R, dist, X, Y) -end +pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} = + _asymmetric_pairwise!(R, dist, X, Y; precalc) _precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)] const PrecalcMinLength = 5 # Only precalc if length >= 5 -function _symmetric_pairwise!(R, dist::QGramDistance, X; precalc = nothing, precalcType = QGramSortedVector) - # precalc if set to true or if isnothing and length is at least min length - shouldprecalc = (precalc === true) | (isnothing(precalc) & length(X) >= PrecalcMinLength) - objs = shouldprecalc ? _precalc(X, precalcType, q(dist)) : X +function precalc_if_needed(X, dist::StringDistance, precalc, precalcType) + # precalc only if a QGramDistance and + # if precalc set to true or if isnothing and length is at least min length + !isa(dist, QGramDistance) && return X + cond = (precalc === true) || + (isnothing(precalc) & length(X) >= PrecalcMinLength) + cond ? 
_precalc(X, precalcType, dist.q) : X +end + +function _symmetric_pairwise!(R, dist::StringDistance, X; + precalc = nothing, precalcType = QGramSortedVector) + + objs = precalc_if_needed(X, dist, precalc, precalcType) for i in 1:length(objs) R[i, i] = 0 - for j in (i+1):length(objs) + Threads.@threads for j in (i+1):length(objs) R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j]) end end return R end + +function _asymmetric_pairwise!(R, dist::StringDistance, X, Y; + precalc = nothing, precalcType = QGramSortedVector) + + objsX = precalc_if_needed(X, dist, precalc, precalcType) + objsY = precalc_if_needed(Y, dist, precalc, precalcType) + + for i in 1:length(objsX) + Threads.@threads for j in 1:length(objsY) + R[i, j] = evaluate(dist, objsX[i], objsY[j]) + end + end + return R +end diff --git a/test/pairwise.jl b/test/pairwise.jl index 68f419e..24ebe2f 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -1,15 +1,19 @@ using StringDistances, Unicode, Test, Random +using StringDistances: pairwise, pairwise!, QGramDistance @testset "pairwise" begin -TestStrings = ["", "abc", "bc", "kitten"] +TestStrings1 = ["", "abc", "bc", "kitten"] +TestStrings2 = ["mew", "ab"] @testset "pairwise" begin - for DT in [Levenshtein, Jaro] - d = DT() - R = pairwise(d, TestStrings) + for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, + QGram, Cosine, Jaccard, SorensenDice, Overlap] - @test R isa Matrix{Float64, 2} + d = (DT <: QGramDistance) ? 
DT(2) : DT() + R = pairwise(d, TestStrings1) + + @test R isa Matrix{Float64} @test size(R) == (4, 4) # No distance on the diagonal, since comparing strings to themselves @@ -18,22 +22,64 @@ TestStrings = ["", "abc", "bc", "kitten"] @test R[3, 3] == 0.0 @test R[4, 4] == 0.0 + # Since the distance might be NaN: + equalorNaN(x, y) = (x == y) || (isnan(x) && isnan(y)) + # First row is comparing "" to the other strings, so: - @test R[1, 2] == evaluate(d, "", "abc") - @test R[1, 3] == evaluate(d, "", "bc") - @test R[1, 4] == evaluate(d, "", "kitten") + @test equalorNaN(R[1, 2], evaluate(d, "", "abc")) + @test equalorNaN(R[1, 3], evaluate(d, "", "bc")) + @test equalorNaN(R[1, 4], evaluate(d, "", "kitten")) # Second row is comparing "abc" to the other strings, so: - @test R[2, 3] == evaluate(d, "abc", "bc") - @test R[2, 4] == evaluate(d, "abc", "kitten") + @test equalorNaN(R[2, 3], evaluate(d, "abc", "bc")) + @test equalorNaN(R[2, 4], evaluate(d, "abc", "kitten")) # Third row row is comparing "bc" to the other strings, so: - @test R[3, 4] == evaluate(d, "bc", "kitten") + @test equalorNaN(R[3, 4], evaluate(d, "bc", "kitten")) # Matrix is symmetric for i in 1:4 for j in (i+1):4 - @test R[i, j] == R[j, i] + @test equalorNaN(R[i, j], R[j, i]) + end + end + + # Test also the assymetric version + R2 = pairwise(d, TestStrings1, TestStrings2) + @test R2 isa Matrix{Float64} + @test size(R2) == (4, 2) + + @test equalorNaN(R2[1, 1], evaluate(d, "", "mew")) + @test equalorNaN(R2[1, 2], evaluate(d, "", "ab")) + + @test equalorNaN(R2[2, 1], evaluate(d, "abc", "mew")) + @test equalorNaN(R2[2, 2], evaluate(d, "abc", "ab")) + + @test equalorNaN(R2[3, 1], evaluate(d, "bc", "mew")) + @test equalorNaN(R2[3, 2], evaluate(d, "bc", "ab")) + + @test equalorNaN(R2[4, 1], evaluate(d, "kitten", "mew")) + @test equalorNaN(R2[4, 2], evaluate(d, "kitten", "ab")) + + R3 = pairwise(d, TestStrings2, TestStrings1) + @test R3 isa Matrix{Float64} + @test size(R3) == (2, 4) + + for i in 1:length(TestStrings1) 
+ for j in 1:length(TestStrings2) + @test equalorNaN(R2[i, j], R3[j, i]) + end + end + + # Ensure same result if precalculating for QGramDistances + if DT <: QGramDistance + R4 = pairwise(d, TestStrings1; precalc = true) + @test typeof(R4) == typeof(R) + @test size(R4) == size(R) + for i in 1:size(R4, 1) + for j in 1:size(R4, 2) + @test equalorNaN(R4[i, j], R[i, j]) + end end end end diff --git a/test/runtests.jl b/test/runtests.jl index 7e391c4..42e8f0a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,3 +3,4 @@ using Test include("distances.jl") include("modifiers.jl") +include("pairwise.jl") \ No newline at end of file From 2ec7c5508ef0fcba0760c7b9bbbb1103f199ae91 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 7 Nov 2020 12:45:58 +0100 Subject: [PATCH 04/21] added doc for pairwise --- src/pairwise.jl | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/pairwise.jl b/src/pairwise.jl index a974fa5..8cfa60b 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,6 +1,33 @@ _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) +""" + pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing) + pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing) + +`pairwise` returns the distance matrix between all pairs of elements in `itr` +according to the distance `dist`. The element type of the returned matrix +can be set via `eltype`. For QGramDistances precalculation will be used either +if `precalc` is set to true or if there are more than 5 elements in `itr`. +Set `precalc` to false if no precalculation should be used, regardless of length. + +Both symmetric and asymmetric versions are available. 
+ +### Examples +```julia-repl +julia> using StringDistances +julia> iter = ["New York", "Princeton"] +julia> pairwise(Levenshtein(), iter) # symmetric +2×2 Array{Float64,2}: + 0.0 9.0 + 9.0 0.0 +julia> iter2 = ["San Francisco"] +julia> pairwise(Levenshtein(), iter, iter2) # asymmetric +2×1 Array{Float64,2}: + 12.0 + 10.0 +``` +""" pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) = pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc) From 0e9faea01c41c4b6fd9c6c70003bc8a2112f5805 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 7 Nov 2020 12:54:37 +0100 Subject: [PATCH 05/21] import Distances and ensure docstring works for pairwise --- src/pairwise.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 8cfa60b..b33ddfe 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,7 +1,9 @@ _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) -""" +import Distances: pairwise + +@doc """ pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing) pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing) @@ -28,6 +30,8 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric 10.0 ``` """ +pairwise + pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) = pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc) From 4840f07109e50a27c2639b8e8bd302818ce45af3 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 7 Nov 2020 13:16:44 +0100 Subject: [PATCH 06/21] script for testing performance of pairwise with and without precalculation --- test/performance/pairwise.jl | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 test/performance/pairwise.jl diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl new file mode 100644 index 0000000..941b6e8 --- /dev/null +++ b/test/performance/pairwise.jl @@ -0,0 
+1,34 @@ +using StringDistances, Random +using BenchmarkTools + +N = if length(ARGS) > 0 + try + parse(Int, ARGS[1]) + catch _ + 100 + end +else + 100 # default value +end + +Maxlength = if length(ARGS) > 1 + try + parse(Int, ARGS[2]) + catch _ + 100 + end +else + 100 # default value +end + +S = String[randstring(rand(3:Maxlength)) for _ in 1:N] + +println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":") + +dist = Cosine(2) +t1 = @belapsed dm1 = pairwise(dist, S; precalc = false) +t2 = @belapsed dm2 = pairwise(dist, S; precalc = true) + +println(" - time WITHOUT pre-calculation: ", round(t1, digits = 3)) +println(" - time WITH pre-calculation: ", round(t2, digits = 3)) +println(" - speedup with pre-calculation: ", round(t1/t2, digits = 1)) From ff22533629ee8648fcfb7a9b1bcb9cd5cda136d4 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 7 Nov 2020 13:26:02 +0100 Subject: [PATCH 07/21] fix so works also on Julia 1.3 --- src/pairwise.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index b33ddfe..a1ff071 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -33,18 +33,18 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric pairwise pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) = - pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc) + pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc = precalc) pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) = - pairwise!(_allocmatrix(X, eltype), dist, X; precalc) + pairwise!(_allocmatrix(X, eltype), dist, X; precalc = precalc) pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} = (dist isa SemiMetric) ? 
- _symmetric_pairwise!(R, dist, X; precalc) : - _asymmetric_pairwise!(R, dist, X, X; precalc) + _symmetric_pairwise!(R, dist, X; precalc = precalc) : + _asymmetric_pairwise!(R, dist, X, X; precalc = precalc) pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} = - _asymmetric_pairwise!(R, dist, X, Y; precalc) + _asymmetric_pairwise!(R, dist, X, Y; precalc = precalc) _precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)] From 5571d0316a91af7cbed87563d56ffc69bbb34a34 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 12:03:24 +0100 Subject: [PATCH 08/21] fixes based on Mathieu's comments --- src/pairwise.jl | 57 ++++++++++++++++++------------------ test/pairwise.jl | 4 +-- test/performance/pairwise.jl | 4 +-- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index a1ff071..ccdee80 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,17 +1,15 @@ _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y)) _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) -import Distances: pairwise - @doc """ - pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing) - pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing) + pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing) + pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing) `pairwise` returns the distance matrix between all pairs of elements in `itr` -according to the distance `dist`. The element type of the returned matrix -can be set via `eltype`. For QGramDistances precalculation will be used either -if `precalc` is set to true or if there are more than 5 elements in `itr`. -Set `precalc` to false if no precalculation should be used, regardless of length. +according to the `StringDistance` `dist`. The element type of the returned matrix +can be set via `eltype`. 
For QGramDistances preprocessing will be used either +if `preprocess` is set to true or if there are more than 5 elements in `itr`. +Set `preprocess` to false if no precalculation should be used, regardless of length. Both symmetric and asymmetric versions are available. @@ -32,37 +30,38 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric """ pairwise -pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) = - pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc = precalc) +Distances.pairwise(dist::StringDistance, X, Y; eltype = Float64, preprocess = nothing) = + pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; preprocess = preprocess) -pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) = - pairwise!(_allocmatrix(X, eltype), dist, X; precalc = precalc) +Distances.pairwise(dist::StringDistance, X; eltype = Float64, preprocess = nothing) = + pairwise!(_allocmatrix(X, eltype), dist, X; preprocess = preprocess) -pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} = +pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; preprocess = nothing) where {N<:Number} = (dist isa SemiMetric) ? 
- _symmetric_pairwise!(R, dist, X; precalc = precalc) : - _asymmetric_pairwise!(R, dist, X, X; precalc = precalc) + _symmetric_pairwise!(R, dist, X; preprocess = preprocess) : + _asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess) -pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} = - _asymmetric_pairwise!(R, dist, X, Y; precalc = precalc) +pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; preprocess = nothing) where {N<:Number} = + _asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess) -_precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)] +_preprocess(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)] const PrecalcMinLength = 5 # Only precalc if length >= 5 -function precalc_if_needed(X, dist::StringDistance, precalc, precalcType) - # precalc only if a QGramDistance and +preprocess_if_needed(X, dist::StringDistance, preprocess, preprocessType) = X + +function preprocess_if_needed(X, dist::QGramDistance, preprocess, preprocessType) + # preprocess only if a QGramDistance and # if precalc set to true or if isnothing and length is at least min length - !isa(dist, QGramDistance) && return X - cond = (precalc === true) || - (isnothing(precalc) & length(X) >= PrecalcMinLength) - cond ? _precalc(X, precalcType, dist.q) : X + cond = (preprocess === true) || + (isnothing(preprocess) && length(X) >= PrecalcMinLength) + cond ? 
_preprocess(X, preprocessType, dist.q) : X end function _symmetric_pairwise!(R, dist::StringDistance, X; - precalc = nothing, precalcType = QGramSortedVector) + preprocess = nothing, preprocessType = QGramSortedVector) - objs = precalc_if_needed(X, dist, precalc, precalcType) + objs = preprocess_if_needed(X, dist, preprocess, preprocessType) for i in 1:length(objs) R[i, i] = 0 @@ -74,10 +73,10 @@ function _symmetric_pairwise!(R, dist::StringDistance, X; end function _asymmetric_pairwise!(R, dist::StringDistance, X, Y; - precalc = nothing, precalcType = QGramSortedVector) + preprocess = nothing, preprocessType = QGramSortedVector) - objsX = precalc_if_needed(X, dist, precalc, precalcType) - objsY = precalc_if_needed(Y, dist, precalc, precalcType) + objsX = preprocess_if_needed(X, dist, preprocess, preprocessType) + objsY = preprocess_if_needed(Y, dist, preprocess, preprocessType) for i in 1:length(objsX) Threads.@threads for j in 1:length(objsY) diff --git a/test/pairwise.jl b/test/pairwise.jl index 24ebe2f..9a42a47 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -71,9 +71,9 @@ TestStrings2 = ["mew", "ab"] end end - # Ensure same result if precalculating for QGramDistances + # Ensure same result if preprocessing for QGramDistances if DT <: QGramDistance - R4 = pairwise(d, TestStrings1; precalc = true) + R4 = pairwise(d, TestStrings1; preprocess = true) @test typeof(R4) == typeof(R) @test size(R4) == size(R) for i in 1:size(R4, 1) diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl index 941b6e8..ae3bc26 100644 --- a/test/performance/pairwise.jl +++ b/test/performance/pairwise.jl @@ -26,8 +26,8 @@ S = String[randstring(rand(3:Maxlength)) for _ in 1:N] println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":") dist = Cosine(2) -t1 = @belapsed dm1 = pairwise(dist, S; precalc = false) -t2 = @belapsed dm2 = pairwise(dist, S; precalc = true) +t1 = @belapsed dm1 = pairwise(dist, S; preprocess = false) +t2 
= @belapsed dm2 = pairwise(dist, S; preprocess = true) println(" - time WITHOUT pre-calculation: ", round(t1, digits = 3)) println(" - time WITH pre-calculation: ", round(t2, digits = 3)) From 7350e4004f0da11af3df6e3900bec074a082afd8 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 15:49:22 +0100 Subject: [PATCH 09/21] try to fix the docstring problem --- src/pairwise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index ccdee80..fa57db0 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -28,7 +28,7 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric 10.0 ``` """ -pairwise +Distances.pairwise Distances.pairwise(dist::StringDistance, X, Y; eltype = Float64, preprocess = nothing) = pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; preprocess = preprocess) From 50747cf4c60ccc28dd5742d8a4a1b4f8cb9d7c52 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 15:54:41 +0100 Subject: [PATCH 10/21] docstring more in line with the one in Distances --- src/pairwise.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index fa57db0..174225a 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -5,11 +5,12 @@ _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing) pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing) -`pairwise` returns the distance matrix between all pairs of elements in `itr` -according to the `StringDistance` `dist`. The element type of the returned matrix -can be set via `eltype`. For QGramDistances preprocessing will be used either -if `preprocess` is set to true or if there are more than 5 elements in `itr`. -Set `preprocess` to false if no precalculation should be used, regardless of length. +Compute distances between all pairs of elements in `itr`according to the `StringDistance` +`dist`. 
The element type of the returned distance matrix can be set via `eltype`. + +For QGramDistances preprocessing will be used either if `preprocess` is set to true or +if there are more than 5 elements in `itr`. Set `preprocess` to false if no +preprocessing should be used, regardless of length. Both symmetric and asymmetric versions are available. From 624184382baf3d27a2c9102b4485148c8d20f135 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 15:56:09 +0100 Subject: [PATCH 11/21] fixed typo and formatting of docstring --- src/pairwise.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 174225a..93bdd20 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -5,12 +5,13 @@ _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X)) pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing) pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing) -Compute distances between all pairs of elements in `itr`according to the `StringDistance` -`dist`. The element type of the returned distance matrix can be set via `eltype`. +Compute distances between all pairs of elements in `itr` according to the +`StringDistance` `dist`. The element type of the returned distance matrix +can be set via `eltype`. -For QGramDistances preprocessing will be used either if `preprocess` is set to true or -if there are more than 5 elements in `itr`. Set `preprocess` to false if no -preprocessing should be used, regardless of length. +For QGramDistances preprocessing will be used either if `preprocess` is set +to true or if there are more than 5 elements in `itr`. Set `preprocess` to +false if no preprocessing should be used, regardless of length. Both symmetric and asymmetric versions are available. 
From 45ae2d96461f001e1e98d0547b7b1ff34d05ced5 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 16:06:52 +0100 Subject: [PATCH 12/21] cache test strings between runs for more comparable results --- test/performance/pairwise.jl | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl index ae3bc26..fd9a060 100644 --- a/test/performance/pairwise.jl +++ b/test/performance/pairwise.jl @@ -21,7 +21,31 @@ else 100 # default value end -S = String[randstring(rand(3:Maxlength)) for _ in 1:N] +# If there are strings already cached to disk we start with them and only +# add new ones if needed. +using Serialization +const CacheFile = joinpath(@__DIR__(), "perfteststrings_$(Maxlength).juliabin") +S = if isfile(CacheFile) + try + res = deserialize(CacheFile) + println("Read $(length(res)) strings from cache file: $CacheFile") + res + catch err + String[] + end +else + println("Creating $N random strings.") + String[randstring(rand(3:Maxlength)) for _ in 1:N] +end + +if length(S) < N + for i in (length(S)+1):N + push!(S, randstring(rand(3:Maxlength))) + end + println("Saving cache file with $(length(S)) strings: $CacheFile") + serialize(CacheFile, S) +end + println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":") From 6c5a14e6694518b13cb7d76c829b3fc49d78c83e Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 16:10:28 +0100 Subject: [PATCH 13/21] ensure test strigns cached also if newly created --- test/performance/pairwise.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl index fd9a060..b92eb7c 100644 --- a/test/performance/pairwise.jl +++ b/test/performance/pairwise.jl @@ -25,6 +25,8 @@ end # add new ones if needed. 
using Serialization const CacheFile = joinpath(@__DIR__(), "perfteststrings_$(Maxlength).juliabin") +SaveCache = false + S = if isfile(CacheFile) try res = deserialize(CacheFile) @@ -36,12 +38,17 @@ S = if isfile(CacheFile) else println("Creating $N random strings.") String[randstring(rand(3:Maxlength)) for _ in 1:N] + SaveCache = true end if length(S) < N for i in (length(S)+1):N push!(S, randstring(rand(3:Maxlength))) end + SaveCache = true +end + +if SaveCache println("Saving cache file with $(length(S)) strings: $CacheFile") serialize(CacheFile, S) end From 9267d07b34e7adc96330bc181913aa70edd46617 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sun, 8 Nov 2020 16:11:58 +0100 Subject: [PATCH 14/21] fixed ordering bug in test script --- test/performance/pairwise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl index b92eb7c..b5d88f6 100644 --- a/test/performance/pairwise.jl +++ b/test/performance/pairwise.jl @@ -37,8 +37,8 @@ S = if isfile(CacheFile) end else println("Creating $N random strings.") - String[randstring(rand(3:Maxlength)) for _ in 1:N] SaveCache = true + String[randstring(rand(3:Maxlength)) for _ in 1:N] end if length(S) < N From a24cd49e16b7eeb24206093f14ef0a2f3534e531 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 10:32:34 +0100 Subject: [PATCH 15/21] added the MorisitaOverlap distance which uses the multiplicities of q-grams --- README.md | 1 + src/StringDistances.jl | 1 + src/distances/qgram.jl | 46 +++++++++++++++++++++++++++++++++++++++++- test/distances.jl | 16 +++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 22f63ac..1b977e9 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ The available distances are: - [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)` - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)` - 
[Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)` + - [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)` - Distance "modifiers" that can be applied to any distance: - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string. - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 89efc47..ffc8b84 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -35,6 +35,7 @@ Cosine, Jaccard, SorensenDice, Overlap, +MorisitaOverlap, QGramDict, QGramSortedVector, Winkler, diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index a420877..957d6bd 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -377,4 +377,48 @@ newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0) c.shared += (n1 > 0) & (n2 > 0) calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) = - 1.0 - c.shared / min(c.left, c.right) \ No newline at end of file + 1.0 - c.shared / min(c.left, c.right) + +""" + MorisitaOverlap(q::Int) + +Creates a MorisitaOverlap distance, a general, statistical measure of +dispersion which can also be used on dictionaries such as created +from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index + +The distance corresponds to + +``(2 * sum(m(s1) .* m(s2))) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))`` + +where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the +sum of those counts.
+""" +struct MorisitaOverlap <: QGramDistance + q::Int +end + +mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter + leftsum::T # sum(m(s1)) + rightsum::T # sum(m(s2)) + leftsq::T # sum(m(s1).^2) + rightsq::T # sum(m(s2).^2) + shared::T # sum(m(s1) .* m(s2)) +end + +newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0) + +@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer) + c.leftsum += n1 + c.leftsq += (n1^2) +end + +@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer) + c.rightsum += n2 + c.rightsq += (n2^2) +end + +@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) = + c.shared += (n1 * n2) + +calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) = + (2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum) diff --git a/test/distances.jl b/test/distances.jl index 7aaa8da..15f00fd 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -130,6 +130,22 @@ using StringDistances, Unicode, Test, Random @test ismissing(evaluate(Overlap(1), "", missing)) end + @testset "MorisitaOverlap" begin + # overlap for 'n', 'h', and 't' and 5 q-grams per string: + @test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5)) + + # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors + # ms1 = [1, 1, 1, 2, 1, 1, 0] + # ms2 = [2, 1, 1, 2, 0, 0, 1] + # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7 + @test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 14/18 + @test MorisitaOverlap(1)("context", "contact") == 0.8 + + @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1)) + @inferred evaluate(MorisitaOverlap(1), "", "") + @test ismissing(evaluate(MorisitaOverlap(1), "", missing)) + end + @testset "QGramDict and QGramSortedVector 
counts qgrams" begin # To get something we can more easily compare to: stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p)) From 36cd8cbfc55a53710af5e23c97e0a5fbe32350b8 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 10:35:05 +0100 Subject: [PATCH 16/21] more detailed doc string for MorisitaOverlap --- src/distances/qgram.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 957d6bd..7d335d8 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -385,6 +385,9 @@ calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) = Creates a MorisitaOverlap distance, a general, statistical measure of dispersion which can also be used on dictionaries such as created from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index +This is more fine-grained than many of the other QGramDistances since +it is based on the counts per q-gram rather than only which q-grams are +in the strings. 
The distance corresponds to From 1a6f63206f6a44560f11fa76dbf9b3c3c9b3f00d Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 10:41:45 +0100 Subject: [PATCH 17/21] fix 'bug' in comment --- test/distances.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/distances.jl b/test/distances.jl index 15f00fd..2d9c94d 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -138,7 +138,7 @@ using StringDistances, Unicode, Test, Random # ms1 = [1, 1, 1, 2, 1, 1, 0] # ms2 = [2, 1, 1, 2, 0, 0, 1] # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7 - @test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 14/18 + @test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20 @test MorisitaOverlap(1)("context", "contact") == 0.8 @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1)) From 9417c89f1ced2ff3e379c7d9539096abfb083cb5 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 10:54:22 +0100 Subject: [PATCH 18/21] added MorisitaOverlap test also for 2-gram --- test/distances.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/distances.jl b/test/distances.jl index 2d9c94d..2c8cef9 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -141,6 +141,12 @@ using StringDistances, Unicode, Test, Random @test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20 @test MorisitaOverlap(1)("context", "contact") == 0.8 + # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct" + # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0] + # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1] + # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6 + @test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6)) + @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1)) @inferred 
evaluate(MorisitaOverlap(1), "", "") @test ismissing(evaluate(MorisitaOverlap(1), "", missing)) From 5cc500053fa17c125c7a4cec4004958d28b0a1de Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 11:11:03 +0100 Subject: [PATCH 19/21] also test MorisitaOverlap with preprocessing --- test/distances.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/distances.jl b/test/distances.jl index 2c8cef9..4d06064 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -234,7 +234,7 @@ using StringDistances, Unicode, Test, Random end @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin - for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap] + for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap] for _ in 1:100 qlen = rand(2:9) dist = D(qlen) From 919a78aa08525f380cb119863ff5fa5a1fc1fcc6 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 16:57:30 +0100 Subject: [PATCH 20/21] fix bug in MorisitaOverlap and added NormalizedMultisetDistance (NMD) --- README.md | 1 + src/StringDistances.jl | 2 ++ src/distances/qgram.jl | 43 +++++++++++++++++++++++++++++++++++++++++- test/distances.jl | 28 ++++++++++++++++++++++----- 4 files changed, 68 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1b977e9..721e033 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ The available distances are: - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)` - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)` - [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)` + - [NormalizedMultisetDistance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)` - Distance "modifiers" that can be applied to any distance: - 
[Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string. - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. diff --git a/src/StringDistances.jl b/src/StringDistances.jl index ffc8b84..be59d7b 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -36,6 +36,8 @@ Jaccard, SorensenDice, Overlap, MorisitaOverlap, +NormalizedMultisetDistance, +NMD, QGramDict, QGramSortedVector, Winkler, diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 7d335d8..b16d124 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -424,4 +424,45 @@ end c.shared += (n1 * n2) calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) = - (2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum) + 1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)) + +""" + NormalizedMultisetDistance(q::Int) + NMD(q::Int) + +Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and +Zigouris 2013. +See https://www.sciencedirect.com/science/article/pii/S1047320313001417 + +The distance corresponds to + +``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))`` + +where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the +sum of those counts.
+""" +struct NormalizedMultisetDistance <: QGramDistance + q::Int +end +const NMD = NormalizedMultisetDistance # frequently used acronym + +newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0) + +@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer) + c.left += n1 + c.shared += n1 # max(n1, 0) == n1 +end + +@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer) + c.right += n2 + c.shared += n2 # max(n2, 0) == n2 +end + +@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer) + c.left += n1 + c.right += n2 + c.shared += max(n1, n2) +end + +calculate(d::NMD, c::ThreeCounters{Int, NMD}) = + (c.shared - min(c.left, c.right)) / max(c.left, c.right) diff --git a/test/distances.jl b/test/distances.jl index 4d06064..003a452 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random @testset "MorisitaOverlap" begin # overlap for 'n', 'h', and 't' and 5 q-grams per string: - @test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5)) + @test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5)) # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors # ms1 = [1, 1, 1, 2, 1, 1, 0] # ms2 = [2, 1, 1, 2, 0, 0, 1] # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7 - @test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20 - @test MorisitaOverlap(1)("context", "contact") == 0.8 + @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 16/20 + @test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4 # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct" # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0] # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1] # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 
6, sum(ms2) = 6 - @test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6)) + @test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6)) @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1)) @inferred evaluate(MorisitaOverlap(1), "", "") @test ismissing(evaluate(MorisitaOverlap(1), "", missing)) end + @testset "NMD" begin + # m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1] + @test evaluate(NMD(1), "night", "nacht") == 0.4 # (7-5)/5 + + # ms1 = [1, 1, 1, 2, 1, 1, 0] + # ms2 = [2, 1, 1, 2, 0, 0, 1] + @test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7) + @test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4 + + # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0] + # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1] + @test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6 + + @test result_type(NMD(1), "hello", "world") == typeof(float(1)) + @inferred evaluate(NMD(1), "", "") + @test ismissing(evaluate(NMD(1), "", missing)) + end + @testset "QGramDict and QGramSortedVector counts qgrams" begin # To get something we can more easily compare to: stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p)) @@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random end @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin - for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap] + for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD] for _ in 1:100 qlen = rand(2:9) dist = D(qlen) From d1c283a517da8aec835f5f8eb365b0dbd5b8e659 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Tue, 10 Nov 2020 17:10:36 +0100 Subject: [PATCH 21/21] better docstring for NMD --- src/distances/qgram.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index b16d124..5efd87d 100755 --- a/src/distances/qgram.jl +++ 
b/src/distances/qgram.jl @@ -431,8 +431,9 @@ calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) = NMD(q::Int) Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and -Zigouris 2013. -See https://www.sciencedirect.com/science/article/pii/S1047320313001417 +Zigouris 2013. The goal with this distance is to behave similarly to a normalized +compression distance without having to do any actual compression (and thus being +faster to compute). The distance corresponds to @@ -440,6 +441,9 @@ The distance corresponds to where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the sum of those counts. + +For details see: +https://www.sciencedirect.com/science/article/pii/S1047320313001417 """ struct NormalizedMultisetDistance <: QGramDistance q::Int