Skip to content

Commit

Permalink
add Hamming + restrict pairwise to vectors + handle missings
Browse files Browse the repository at this point in the history
  • Loading branch information
matthieugomez committed Nov 10, 2020
1 parent b407b18 commit e409568
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 40 deletions.
7 changes: 5 additions & 2 deletions src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ include("distances/edit.jl")
include("distances/qgram.jl")
include("normalize.jl")

const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
# Distances API
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))


include("find.jl")
include("pairwise.jl")
Expand All @@ -23,10 +24,12 @@ include("pairwise.jl")

export
StringDistance,
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
RatcliffObershelp,
QGramDistance,
QGram,
Cosine,
Jaccard,
Expand Down
24 changes: 24 additions & 0 deletions src/distances/edit.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
"""
    Hamming()

Create the Hamming distance.

The Hamming distance is the number of positions at which the two inputs differ.
When the inputs have different lengths, the difference in length is added to the count.
"""
struct Hamming{V<:Union{Integer,Nothing}} <: SemiMetric
    # Either `nothing` or an integer; the evaluation method compares its
    # running mismatch count against this value when it is not `nothing`.
    max_dist::V
end

# Zero-argument constructor: build a Hamming distance whose `max_dist` is `nothing`.
Hamming() = Hamming{Nothing}(nothing)

# Evaluate the Hamming distance between `s1` and `s2`.
#
# Returns `missing` when either argument is `missing`. Otherwise returns the
# number of mismatching positions plus the difference in length of the two
# inputs. When `dist.max_dist` is not `nothing`, evaluation stops as soon as the
# count exceeds it and `dist.max_dist + 1` is returned instead.
function (dist::Hamming)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    limit = dist.max_dist
    # Positions past the end of the shorter input all count as mismatches.
    mismatches = abs(length(s2) - length(s1))
    if limit !== nothing && mismatches > limit
        return limit + 1
    end
    for (c1, c2) in zip(s1, s2)
        mismatches += c1 != c2
        # Bail out early once the bound is exceeded.
        if limit !== nothing && mismatches > limit
            return limit + 1
        end
    end
    return mismatches
end


"""
Jaro()
Expand Down
8 changes: 8 additions & 0 deletions src/normalize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ end
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)

# Evaluate the normalized Hamming distance between `s1` and `s2`.
#
# Arguments:
# - `s1`, `s2`: the inputs to compare; if either is `missing` the result is `missing`.
# - `max_dist`: threshold (default `1.0`); any normalized value above it is
#   reported as `1.0`.
#
# Returns the wrapped distance divided by the length of the second input after
# `reorder`, or `1.0` when that length is zero.
function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; presumably it puts the shorter
    # input first so that `s2` is the longer one -- confirm against its definition.
    s1, s2 = reorder(s1, s2)
    denom = length(s2)
    # Avoid dividing by zero; an empty second input is reported as distance 1.0.
    denom == 0 && return 1.0
    out = dist.dist(s1, s2) / denom
    return out > max_dist ? 1.0 : out
end

# A normalized distance is between 0 and 1, and accept a third argument, max_dist.
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
Expand Down
83 changes: 47 additions & 36 deletions src/pairwise.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
@doc """
pairwise(dist::StringDistance, itr; preprocess = nothing)
pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing)
pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`.
For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
Both symmetric and asymmetric versions are available.
Expand All @@ -28,68 +28,79 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
"""
Distances.pairwise

function Distances.pairwise(dist::StringDistance, X, Y; preprocess = length(X) >= 5)
T = result_type(dist, eltype(X), eltype(Y))
R = Matrix{T}(undef, length(X), length(Y))
pairwise!(R, dist, X, Y; preprocess = preprocess)
function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
T = result_type(dist, eltype(xs), eltype(ys))
if Missing <: Union{eltype(xs), eltype(ys)}
T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end

function Distances.pairwise(dist::StringDistance, X; preprocess = nothing)
T = result_type(dist, eltype(X), eltype(X))
R = Matrix{T}(undef, length(X), length(X))
pairwise!(R, dist, X; preprocess = preprocess)
function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
T = result_type(dist, eltype(xs), eltype(xs))
if Missing <: eltype(xs)
T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(xs))
pairwise!(R, dist, xs; preprocess = preprocess)
end

@doc """
pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing)
pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing)
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the
`StringDistance` `dist` and write the result in `r`.
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`.
For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
Distances.pairwise!

function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
_asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess)
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end

function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(xs) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
(dist isa SemiMetric) ?
_symmetric_pairwise!(R, dist, X; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess)
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
end

function _preprocess(X, dist::QGramDistance, preprocess)
if (preprocess === true) || (isnothing(preprocess) && length(X) >= 5)
return map(x -> QGramSortedVector(x, dist.q), X)
function _preprocess(xs, dist::QGramDistance, preprocess)
if (preprocess === true) || (isnothing(preprocess) && length(xs) >= 5)
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
else
return X
return xs
end
end
_preprocess(X, dist::StringDistance, preprocess) = X
_preprocess(xs, dist::StringDistance, preprocess) = xs


function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
objs = _preprocess(X, dist, preprocess)
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
objs = _preprocess(xs, dist, preprocess)
for i in 1:length(objs)
R[i, i] = 0
# handle missing
R[i, i] = objs[i] != objs[i]
Threads.@threads for j in (i+1):length(objs)
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
end
end
return R
end

function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
objsX = _preprocess(X, dist, preprocess)
objsY = _preprocess(Y, dist, preprocess)
for i in 1:length(objsX)
Threads.@threads for j in 1:length(objsY)
R[i, j] = evaluate(dist, objsX[i], objsY[j])
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
objsxs = _preprocess(xs, dist, preprocess)
objsys = _preprocess(ys, dist, preprocess)
for i in 1:length(objsxs)
Threads.@threads for j in 1:length(objsys)
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
end
end
return R
Expand Down
7 changes: 7 additions & 0 deletions test/distances.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
using StringDistances, Unicode, Test, Random

@testset "Distances" begin
@testset "Hamming" begin
    # Hamming counts are integers, so exact equality is the right comparison.
    @test evaluate(Hamming(), "martha", "marhta") == 2
    @test evaluate(Hamming(), "es an ", " vs an") == 6
    @test evaluate(Hamming(), [1, 2, 3], [1, 2, 4]) == 1
    # Inputs of unequal length: the length difference is added to the count.
    @test evaluate(Hamming(), "abc", "abcde") == 2
    # With a bound, evaluation stops once exceeded and returns `max_dist + 1`.
    @test evaluate(Hamming(1), "martha", "marhta") == 2
    # The return type must be inferrable for the default (unbounded) distance.
    @inferred evaluate(Hamming(), "", "")
    # A missing argument propagates to the result.
    @test ismissing(evaluate(Hamming(), "", missing))
end

@testset "Jaro" begin
@test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547
Expand Down
8 changes: 6 additions & 2 deletions test/pairwise.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
using StringDistances, Unicode, Test, Random
using StringDistances: pairwise, pairwise!, QGramDistance

@testset "pairwise" begin

TestStrings1 = ["", "abc", "bc", "kitten"]
TestStrings2 = ["mew", "ab"]

TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]

@testset "pairwise" begin
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap]
Expand Down Expand Up @@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
end
end
end
# ensures missing
R5 = pairwise(d, TestStrings1missing; preprocess = true)
@test eltype(R5) == Union{result_type(d, String, String), Missing}
end
end

Expand Down

0 comments on commit e409568

Please sign in to comment.