diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index f8705fe..89efc47 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -7,10 +7,11 @@ include("distances/edit.jl")
 include("distances/qgram.jl")
 include("normalize.jl")
 
-const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
+const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
 # Distances API
 Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
-Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2))
+Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
+
 
 include("find.jl")
 include("pairwise.jl")
@@ -23,10 +24,12 @@ include("pairwise.jl")
 
 export
 StringDistance,
+Hamming,
 Levenshtein,
 DamerauLevenshtein,
 Jaro,
 RatcliffObershelp,
+QGramDistance,
 QGram,
 Cosine,
 Jaccard,
diff --git a/src/distances/edit.jl b/src/distances/edit.jl
index c2c6e06..bfe2842 100755
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@@ -1,3 +1,27 @@
+"""
+    Hamming()
+
+Creates the Hamming distance. With `Hamming(max_dist)`, any distance above `max_dist` is returned as `max_dist + 1`.
+
+The Hamming distance is defined as the number of positions at which the characters do not match, plus the difference in length.
+"""
+struct Hamming{V <: Union{Integer, Nothing}} <: SemiMetric
+    max_dist::V
+end
+Hamming() = Hamming(nothing)
+
+function (dist::Hamming)(s1, s2)
+    ((s1 === missing) | (s2 === missing)) && return missing
+    current = abs(length(s2) - length(s1))
+    dist.max_dist !== nothing && current > dist.max_dist && return dist.max_dist + 1
+    for (ch1, ch2) in zip(s1, s2)
+        current += ch1 != ch2
+        dist.max_dist !== nothing && current > dist.max_dist && return dist.max_dist + 1
+    end
+    return current
+end
+
+
 """
     Jaro()
 
diff --git a/src/normalize.jl b/src/normalize.jl
index 88cea90..299ae6c 100755
--- a/src/normalize.jl
+++ b/src/normalize.jl
@@ -11,6 +11,14 @@ end
 
 normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
 normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
+function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
+    ((s1 === missing) | (s2 === missing)) && return missing
+    s1, s2 = reorder(s1, s2)
+    len2 = length(s2)
+    len2 == 0 && return 1.0
+    out = dist.dist(s1, s2) / len2
+    out > max_dist ? 1.0 : out
+end
 
 # A normalized distance is between 0 and 1, and accept a third argument, max_dist.
 function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
diff --git a/src/pairwise.jl b/src/pairwise.jl
index 1ab1699..3a91118 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -1,12 +1,12 @@
 @doc """
-    pairwise(dist::StringDistance, itr; preprocess = nothing)
-    pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing)
+    pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
 
-Compute distances between all pairs of elements in `itr` according to the
+Compute distances between all pairs of elements in `xs` and `ys` according to the
 `StringDistance` `dist`.
 
 For QGramDistances preprocessing will be used either if `preprocess` is set
-to true or if there are more than 5 elements in `itr`. Set `preprocess` to
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to
 false if no preprocessing should be used, regardless of length.
 
 Both symmetric and asymmetric versions are available.
@@ -28,55 +28,66 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
 """
 Distances.pairwise
 
-function Distances.pairwise(dist::StringDistance, X, Y; preprocess = length(X) >= 5)
-    T = result_type(dist, eltype(X), eltype(Y))
-    R = Matrix{T}(undef, length(X), length(Y))
-    pairwise!(R, dist, X, Y; preprocess = preprocess)
+function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
+    T = result_type(dist, eltype(xs), eltype(ys))
+    if Missing <: Union{eltype(xs), eltype(ys)}
+        T = Union{T, Missing}
+    end
+    R = Matrix{T}(undef, length(xs), length(ys))
+    pairwise!(R, dist, xs, ys; preprocess = preprocess)
 end
 
-function Distances.pairwise(dist::StringDistance, X; preprocess = nothing)
-    T = result_type(dist, eltype(X), eltype(X))
-    R = Matrix{T}(undef, length(X), length(X))
-    pairwise!(R, dist, X; preprocess = preprocess)
+function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    T = result_type(dist, eltype(xs), eltype(xs))
+    if Missing <: eltype(xs)
+        T = Union{T, Missing}
+    end
+    R = Matrix{T}(undef, length(xs), length(xs))
+    pairwise!(R, dist, xs; preprocess = preprocess)
 end
 
 @doc """
-    pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing)
-    pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing)
+    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
 
-Compute distances between all pairs of elements in `itr` according to the
-`StringDistance` `dist` and write the result in `r`.
+Compute distances between all pairs of elements in `xs` and `ys` according to the
+`StringDistance` `dist` and write the result in `R`.
 
 For QGramDistances preprocessing will be used either if `preprocess` is set
-to true or if there are more than 5 elements in `itr`. Set `preprocess` to
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to
 false if no preprocessing should be used, regardless of length.
 """
 Distances.pairwise!
 
-function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
-    _asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess)
+function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
+    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
+    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
+    _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
 end
 
-function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
+function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
+    length(xs) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
     (dist isa SemiMetric) ?
-        _symmetric_pairwise!(R, dist, X; preprocess = preprocess) :
-        _asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess)
+        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
+        _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
 end
 
-function _preprocess(X, dist::QGramDistance, preprocess)
-    if (preprocess === true) || (isnothing(preprocess) && length(X) >= 5)
-        return map(x -> QGramSortedVector(x, dist.q), X)
+function _preprocess(xs, dist::QGramDistance, preprocess)
+    if (preprocess === true) || (isnothing(preprocess) && length(xs) >= 5)
+        return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
     else
-        return X
+        return xs
     end
 end
-_preprocess(X, dist::StringDistance, preprocess) = X
+_preprocess(xs, dist::StringDistance, preprocess) = xs
 
 
-function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
-    objs = _preprocess(X, dist, preprocess)
+function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    objs = _preprocess(xs, dist, preprocess)
     for i in 1:length(objs)
-        R[i, i] = 0
+        # the distance of an element to itself is 0; a missing element stays missing
+        R[i, i] = objs[i] === missing ? missing : 0
         Threads.@threads for j in (i+1):length(objs)
             R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
         end
@@ -84,12 +95,12 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; prepro
     return R
 end
 
-function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
-    objsX = _preprocess(X, dist, preprocess)
-    objsY = _preprocess(Y, dist, preprocess)
-    for i in 1:length(objsX)
-        Threads.@threads for j in 1:length(objsY)
-            R[i, j] = evaluate(dist, objsX[i], objsY[j])
+function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
+    objsxs = _preprocess(xs, dist, preprocess)
+    objsys = _preprocess(ys, dist, preprocess)
+    for i in 1:length(objsxs)
+        Threads.@threads for j in 1:length(objsys)
+            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
         end
     end
     return R
diff --git a/test/distances.jl b/test/distances.jl
index 1fc878b..7aaa8da 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -1,6 +1,13 @@
 using StringDistances, Unicode, Test, Random
 
 @testset "Distances" begin
+    @testset "Hamming" begin
+        @test evaluate(Hamming(), "martha", "marhta") ≈ 2
+        @test evaluate(Hamming(), "es an ", " vs an") ≈ 6
+        @test evaluate(Hamming(), [1, 2, 3], [1,2, 4]) ≈ 1
+        @inferred evaluate(Hamming(), "", "")
+        @test ismissing(evaluate(Hamming(), "", missing))
+    end
 
     @testset "Jaro" begin
         @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
diff --git a/test/pairwise.jl b/test/pairwise.jl
index 12c3ce6..94e61ca 100644
--- a/test/pairwise.jl
+++ b/test/pairwise.jl
@@ -1,11 +1,12 @@
 using StringDistances, Unicode, Test, Random
-using StringDistances: pairwise, pairwise!, QGramDistance
-
 @testset "pairwise" begin
 
 TestStrings1 = ["", "abc", "bc", "kitten"]
 TestStrings2 = ["mew", "ab"]
 
+TestStrings1missing = ["", "abc", "bc", missing]
+TestStrings2missing = ["mew", missing]
+
 @testset "pairwise" begin
     for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
                 QGram, Cosine, Jaccard, SorensenDice, Overlap]
@@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
             end
         end
     end
+    # ensures missing
+    R5 = pairwise(d, TestStrings1missing; preprocess = true)
+    @test eltype(R5) == Union{result_type(d, String, String), Missing}
 end
 end
 