Skip to content

Commit

Permalink
Merge cd727e5 into f4185fb
Browse files Browse the repository at this point in the history
  • Loading branch information
robertfeldt committed Nov 10, 2020
2 parents f4185fb + cd727e5 commit dd0436f
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 6 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ The available distances are:
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
- [NormalizedMultisetDistance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)`
- Distance "modifiers" that can be applied to any distance:
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.
Expand Down
2 changes: 2 additions & 0 deletions src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Jaccard,
SorensenDice,
Overlap,
MorisitaOverlap,
NormalizedMultisetDistance,
NMD,
QGramDict,
QGramSortedVector,
Winkler,
Expand Down
47 changes: 46 additions & 1 deletion src/distances/qgram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -424,4 +424,49 @@ end
c.shared += (n1 * n2)

# Morisita overlap reported as a *distance*: one minus the overlap index, so that
# identical q-gram profiles give 0.0. `c.shared` holds the sum of the products of
# matching q-gram counts; `leftsq`/`rightsq` are the sums of squared counts and
# `leftsum`/`rightsum` the plain sums of counts for each string.
# NOTE: when either string yields no q-grams the sums are 0 and the result is NaN.
function calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap})
    denom = c.leftsq * c.rightsum / c.leftsum + c.rightsq * c.leftsum / c.rightsum
    return 1.0 - ((2 * c.shared) / denom)
end

"""
    NormalizedMultisetDistance(q::Int)
    NMD(q::Int)

Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
Zigouris 2013. The goal with this distance is to behave similarly to a normalized
compression distance without having to do any actual compression (and thus being
faster to compute).

The distance corresponds to

``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))``

where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.

For details see:
https://www.sciencedirect.com/science/article/pii/S1047320313001417
"""
struct NormalizedMultisetDistance <: QGramDistance
q::Int # length of the q-grams that are counted
end
const NMD = NormalizedMultisetDistance # frequently used acronym

# Build the accumulator used for a single NMD evaluation; every tally starts at zero.
function newcounter(d::NMD)
    return ThreeCounters{Int, NMD}(0, 0, 0)
end

# Account for a q-gram count `n1` reported for the left string only (presumably the
# q-gram is absent from the right string -- confirm against the caller).
@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
    c.left = c.left + n1
    # `shared` accumulates the element-wise maximum of the two counts; with nothing
    # on the right-hand side that maximum is simply n1.
    c.shared = c.shared + n1
end

# Account for a q-gram count `n2` reported for the right string only (presumably the
# q-gram is absent from the left string -- confirm against the caller).
@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
    c.right = c.right + n2
    # `shared` accumulates the element-wise maximum of the two counts; with nothing
    # on the left-hand side that maximum is simply n2.
    c.shared = c.shared + n2
end

# Account for a q-gram with count `n1` on the left and `n2` on the right: both
# per-string totals grow by their own count, while `shared` grows by the larger of
# the two (the element-wise maximum used in the NMD numerator).
@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
    c.left = c.left + n1
    c.right = c.right + n2
    c.shared = c.shared + max(n1, n2)
end

# Final NMD value: (sum of element-wise maxima - smaller total) / larger total,
# where `c.shared` holds the accumulated maxima and `c.left`/`c.right` the totals.
function calculate(d::NMD, c::ThreeCounters{Int, NMD})
    smaller, larger = minmax(c.left, c.right)
    return (c.shared - smaller) / larger
end
28 changes: 23 additions & 5 deletions test/distances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random

@testset "MorisitaOverlap" begin
    # MorisitaOverlap is evaluated as a distance, i.e. 1.0 minus the overlap index.

    # overlap for 'n', 'h', and 't' and 5 q-grams per string:
    @test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))

    # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
    # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1.0 - 16/20 = 0.2
    @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ 0.2 atol=1e-4
    @test MorisitaOverlap(1)("context", "contact") ≈ 0.2 atol=1e-4

    # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
    @test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6))

    @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(MorisitaOverlap(1), "", "")
    @test ismissing(evaluate(MorisitaOverlap(1), "", missing))
end

@testset "NMD" begin
    # NMD = (sum of element-wise max counts - min(total1, total2)) / max(total1, total2)

    # m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
    @test evaluate(NMD(1), "night", "nacht") == 0.4 # (7-5)/5

    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # ((2+1+1+2+1+1+1)-7)/(7) = 2/7
    @test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol=1e-4
    @test NMD(1)("context", "contact") ≈ 0.2857 atol=1e-4

    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    @test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6

    @test result_type(NMD(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(NMD(1), "", "")
    @test ismissing(evaluate(NMD(1), "", missing))
end

@testset "QGramDict and QGramSortedVector counts qgrams" begin
# To get something we can more easily compare to:
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
Expand Down Expand Up @@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random
end

@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD]
for _ in 1:100
qlen = rand(2:9)
dist = D(qlen)
Expand Down

0 comments on commit dd0436f

Please sign in to comment.