From a33f837b4269350205eaf087b16e71325b7b0514 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 25 Oct 2020 09:57:33 +0100
Subject: [PATCH 01/21] fix tests for short strings (shorter than q in
 QGramDistances)

---
 test/distances.jl | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/test/distances.jl b/test/distances.jl
index 42bafae..2c9dae7 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -154,28 +154,23 @@ using StringDistances, Unicode, Test, Random
 		@test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
 	end
 
-	function partlyoverlappingstrings(sizerange, chars = [])
-		str1 = if length(chars) < 1
-			randstring(rand(sizerange))
-		else
-			randstring(chars, rand(sizerange))
-		end
-		elems = collect(str1)
-		ci1 = prevind(str1, rand(2:div(length(elems), 2)))
-		ci2 = prevind(str1, rand((ci1+1):(length(elems)-1)))
-		str2 = if length(chars) < 1
-			randstring(ci1-1) * join(elems[ci1:ci2]) * randstring(length(str1)-ci2)
-		else
-			randstring(chars, ci1-1) * join(elems[ci1:ci2]) * randstring(chars, length(str1)-ci2)
-		end
-		return str1, str2
+	function partlyoverlappingstrings(sizerange, chars = nothing)
+		l = rand(sizerange)
+		str1 = isnothing(chars) ? randstring(l) : randstring(chars, l)
+		ci1 = thisind(str1, rand(1:l))
+		ci2 = thisind(str1, rand(ci1:l))
+		copied = join(str1[ci1:ci2])
+		prefix = isnothing(chars) ? randstring(ci1-1) : randstring(chars, ci1-1)
+		slen = l - length(copied) - length(prefix)
+		suffix = isnothing(chars) ? randstring(slen) : randstring(chars, slen)
+		return str1, (prefix * copied * suffix)
 	end
 
 	@testset "Precalculation on unicode strings" begin
 		Chars = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...)
 		for _ in 1:100
-			str1, str2 = partlyoverlappingstrings(10:100, Chars)
 			qlen = rand(2:5)
+			str1, str2 = partlyoverlappingstrings(6:100, Chars)
 			d = Jaccard(qlen)
 
 			qd1 = QGramDict(str1, qlen)
@@ -196,12 +191,25 @@ using StringDistances, Unicode, Test, Random
 		end
 	end
 
+	@testset "QGram distance on short strings" begin
+		@test isnan(evaluate(Overlap(2),  "1",  "2"))
+		@test isnan(evaluate(Jaccard(3), "s1", "s2"))
+		@test isnan(evaluate(Cosine(5),  "s1", "s2"))
+
+		@test !isnan(evaluate(Overlap(2),  "s1",  "s2"))
+		@test !isnan(evaluate(Jaccard(3), "st1", "st2"))
+		@test !isnan(evaluate(Cosine(5),  "stri1", "stri2"))
+
+		@test !isnan(evaluate(Jaccard(3), "st1", "str2"))
+		@test !isnan(evaluate(Jaccard(3), "str1", "st2"))
+	end
+
 	@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
 		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
 			for _ in 1:100
 				qlen = rand(2:9)
 				dist = D(qlen)
-				str1, str2 = partlyoverlappingstrings(5:10000)
+				str1, str2 = partlyoverlappingstrings(10:10000)
 
 				# QGramDict gets same result as for standard string
 				qd1 = QGramDict(str1, qlen)

From eb6617bae12ea90270b014919bd692ed7f370bab Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Mon, 26 Oct 2020 16:19:27 +0100
Subject: [PATCH 02/21] started adding pairwise with tests

---
 src/pairwise.jl  | 38 ++++++++++++++++++++++++++++++++++++++
 test/pairwise.jl | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 src/pairwise.jl
 create mode 100644 test/pairwise.jl

diff --git a/src/pairwise.jl b/src/pairwise.jl
new file mode 100644
index 0000000..f7b1fed
--- /dev/null
+++ b/src/pairwise.jl
@@ -0,0 +1,38 @@
+_allocmat(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
+_allocmat(X, T) = Matrix{T}(undef, length(X), length(X))
+
+pairwise(dist::StringDistance, X, Y; eltype = Float64) = 
+    pairwise!(_allocmat(X, Y, eltype), dist, X, Y)
+
+pairwise(dist::StringDistance, X; eltype = Float64) = 
+    pairwise!(_allocmat(X, eltype), dist, X)
+
+function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X) where {N<:Number}
+    if dist isa SemiMetric
+        _symmetric_pairwise!(R, dist, X)
+    else
+        _asymmetric_pairwise!(R, dist, X, X)
+    end
+end
+
+function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y) where {N<:Number}
+    _asymmetric_pairwise!(R, dist, X, Y)
+end
+
+_precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)]
+
+const PrecalcMinLength = 5 # Only precalc if length >= 5
+
+function _symmetric_pairwise!(R, dist::QGramDistance, X; precalc = nothing, precalcType = QGramSortedVector)
+    # precalc if set to true or if isnothing and length is at least min length
+    shouldprecalc = (precalc === true) | (isnothing(precalc) & length(X) >= PrecalcMinLength)
+    objs = shouldprecalc ? _precalc(X, precalcType, q(dist)) : X
+
+    for i in 1:length(objs)
+        R[i, i] = 0
+        for j in (i+1):length(objs)
+            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
+        end
+    end
+    return R
+end
diff --git a/test/pairwise.jl b/test/pairwise.jl
new file mode 100644
index 0000000..68f419e
--- /dev/null
+++ b/test/pairwise.jl
@@ -0,0 +1,42 @@
+using StringDistances, Unicode, Test, Random
+
+@testset "pairwise" begin
+
+TestStrings = ["", "abc", "bc", "kitten"]
+
+@testset "pairwise" begin
+	for DT in [Levenshtein, Jaro]
+		d = DT()
+		R = pairwise(d, TestStrings)
+
+		@test R isa Matrix{Float64, 2}
+		@test size(R) == (4, 4)
+
+		# No distance on the diagonal, since comparing strings to themselves
+		@test R[1, 1] == 0.0
+		@test R[2, 2] == 0.0
+		@test R[3, 3] == 0.0
+		@test R[4, 4] == 0.0
+
+		# First row is comparing "" to the other strings, so:
+		@test R[1, 2] == evaluate(d, "", "abc")
+		@test R[1, 3] == evaluate(d, "", "bc")
+		@test R[1, 4] == evaluate(d, "", "kitten")
+
+		# Second row is comparing "abc" to the other strings, so:
+		@test R[2, 3] == evaluate(d, "abc", "bc")
+		@test R[2, 4] == evaluate(d, "abc", "kitten")
+
+		# Third row row is comparing "bc" to the other strings, so:
+		@test R[3, 4] == evaluate(d, "bc", "kitten")
+
+		# Matrix is symmetric
+		for i in 1:4
+			for j in (i+1):4
+				@test R[i, j] == R[j, i]
+			end
+		end
+	end
+end
+
+end

From ff1daea00112e006b495f7ec2bfaec7e7abe562a Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sat, 7 Nov 2020 12:23:10 +0100
Subject: [PATCH 03/21] added pairwise for calculating distance matrices

---
 src/StringDistances.jl |  5 +--
 src/pairwise.jl        | 61 +++++++++++++++++++++++-------------
 test/pairwise.jl       | 70 ++++++++++++++++++++++++++++++++++--------
 test/runtests.jl       |  1 +
 4 files changed, 102 insertions(+), 35 deletions(-)

diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index dc4a215..4da9978 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -11,7 +11,7 @@ const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffOber
 # Distances API
 Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
 include("find.jl")
-
+include("pairwise.jl")
 
 ##############################################################################
 ##
@@ -42,6 +42,7 @@ compare,
 result_type,
 qgrams,
 normalize,
-findnearest
+findnearest,
+pairwise
 end
 
diff --git a/src/pairwise.jl b/src/pairwise.jl
index f7b1fed..a974fa5 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -1,38 +1,57 @@
-_allocmat(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
-_allocmat(X, T) = Matrix{T}(undef, length(X), length(X))
+_allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
+_allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
 
-pairwise(dist::StringDistance, X, Y; eltype = Float64) = 
-    pairwise!(_allocmat(X, Y, eltype), dist, X, Y)
+pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) =
+    pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc)
 
-pairwise(dist::StringDistance, X; eltype = Float64) = 
-    pairwise!(_allocmat(X, eltype), dist, X)
+pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) =
+    pairwise!(_allocmatrix(X, eltype), dist, X; precalc)
 
-function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X) where {N<:Number}
-    if dist isa SemiMetric
-        _symmetric_pairwise!(R, dist, X)
-    else
-        _asymmetric_pairwise!(R, dist, X, X)
-    end
-end
+pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} =
+    (dist isa SemiMetric) ?
+        _symmetric_pairwise!(R, dist, X; precalc) :
+        _asymmetric_pairwise!(R, dist, X, X; precalc)
 
-function pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y) where {N<:Number}
-    _asymmetric_pairwise!(R, dist, X, Y)
-end
+pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} =
+    _asymmetric_pairwise!(R, dist, X, Y; precalc)
 
 _precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)]
 
 const PrecalcMinLength = 5 # Only precalc if length >= 5
 
-function _symmetric_pairwise!(R, dist::QGramDistance, X; precalc = nothing, precalcType = QGramSortedVector)
-    # precalc if set to true or if isnothing and length is at least min length
-    shouldprecalc = (precalc === true) | (isnothing(precalc) & length(X) >= PrecalcMinLength)
-    objs = shouldprecalc ? _precalc(X, precalcType, q(dist)) : X
+function precalc_if_needed(X, dist::StringDistance, precalc, precalcType)
+    # precalc only if a QGramDistance and
+    # if precalc set to true or if isnothing and length is at least min length
+    !isa(dist, QGramDistance) && return X
+    cond = (precalc === true) ||
+                (isnothing(precalc) & length(X) >= PrecalcMinLength)
+    cond ? _precalc(X, precalcType, dist.q) : X
+end
+
+function _symmetric_pairwise!(R, dist::StringDistance, X;
+    precalc = nothing, precalcType = QGramSortedVector)
+
+    objs = precalc_if_needed(X, dist, precalc, precalcType)
 
     for i in 1:length(objs)
         R[i, i] = 0
-        for j in (i+1):length(objs)
+        Threads.@threads for j in (i+1):length(objs)
             R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
         end
     end
     return R
 end
+
+function _asymmetric_pairwise!(R, dist::StringDistance, X, Y;
+    precalc = nothing, precalcType = QGramSortedVector)
+
+    objsX = precalc_if_needed(X, dist, precalc, precalcType)
+    objsY = precalc_if_needed(Y, dist, precalc, precalcType)
+
+    for i in 1:length(objsX)
+        Threads.@threads for j in 1:length(objsY)
+            R[i, j] = evaluate(dist, objsX[i], objsY[j])
+        end
+    end
+    return R
+end
diff --git a/test/pairwise.jl b/test/pairwise.jl
index 68f419e..24ebe2f 100644
--- a/test/pairwise.jl
+++ b/test/pairwise.jl
@@ -1,15 +1,19 @@
 using StringDistances, Unicode, Test, Random
+using StringDistances: pairwise, pairwise!, QGramDistance
 
 @testset "pairwise" begin
 
-TestStrings = ["", "abc", "bc", "kitten"]
+TestStrings1 = ["", "abc", "bc", "kitten"]
+TestStrings2 = ["mew", "ab"]
 
 @testset "pairwise" begin
-	for DT in [Levenshtein, Jaro]
-		d = DT()
-		R = pairwise(d, TestStrings)
+	for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
+				QGram, Cosine, Jaccard, SorensenDice, Overlap]
 
-		@test R isa Matrix{Float64, 2}
+		d = (DT <: QGramDistance) ? DT(2) : DT()
+		R = pairwise(d, TestStrings1)
+
+		@test R isa Matrix{Float64}
 		@test size(R) == (4, 4)
 
 		# No distance on the diagonal, since comparing strings to themselves
@@ -18,22 +22,64 @@ TestStrings = ["", "abc", "bc", "kitten"]
 		@test R[3, 3] == 0.0
 		@test R[4, 4] == 0.0
 
+		# Since the distance might be NaN:
+		equalorNaN(x, y) = (x == y) || (isnan(x) && isnan(y))
+
 		# First row is comparing "" to the other strings, so:
-		@test R[1, 2] == evaluate(d, "", "abc")
-		@test R[1, 3] == evaluate(d, "", "bc")
-		@test R[1, 4] == evaluate(d, "", "kitten")
+		@test equalorNaN(R[1, 2], evaluate(d, "", "abc"))
+		@test equalorNaN(R[1, 3], evaluate(d, "", "bc"))
+		@test equalorNaN(R[1, 4], evaluate(d, "", "kitten"))
 
 		# Second row is comparing "abc" to the other strings, so:
-		@test R[2, 3] == evaluate(d, "abc", "bc")
-		@test R[2, 4] == evaluate(d, "abc", "kitten")
+		@test equalorNaN(R[2, 3], evaluate(d, "abc", "bc"))
+		@test equalorNaN(R[2, 4], evaluate(d, "abc", "kitten"))
 
 		# Third row row is comparing "bc" to the other strings, so:
-		@test R[3, 4] == evaluate(d, "bc", "kitten")
+		@test equalorNaN(R[3, 4], evaluate(d, "bc", "kitten"))
 
 		# Matrix is symmetric
 		for i in 1:4
 			for j in (i+1):4
-				@test R[i, j] == R[j, i]
+				@test equalorNaN(R[i, j], R[j, i])
+			end
+		end
+
+		# Test also the assymetric version
+		R2 = pairwise(d, TestStrings1, TestStrings2)
+		@test R2 isa Matrix{Float64}
+		@test size(R2) == (4, 2)
+
+		@test equalorNaN(R2[1, 1], evaluate(d, "", "mew"))
+		@test equalorNaN(R2[1, 2], evaluate(d, "", "ab"))
+
+		@test equalorNaN(R2[2, 1], evaluate(d, "abc", "mew"))
+		@test equalorNaN(R2[2, 2], evaluate(d, "abc", "ab"))
+
+		@test equalorNaN(R2[3, 1], evaluate(d, "bc", "mew"))
+		@test equalorNaN(R2[3, 2], evaluate(d, "bc", "ab"))
+
+		@test equalorNaN(R2[4, 1], evaluate(d, "kitten", "mew"))
+		@test equalorNaN(R2[4, 2], evaluate(d, "kitten", "ab"))
+
+		R3 = pairwise(d, TestStrings2, TestStrings1)
+		@test R3 isa Matrix{Float64}
+		@test size(R3) == (2, 4)
+
+		for i in 1:length(TestStrings1)
+			for j in 1:length(TestStrings2)
+				@test equalorNaN(R2[i, j], R3[j, i])
+			end
+		end
+
+		# Ensure same result if precalculating for QGramDistances
+		if DT <: QGramDistance
+			R4 = pairwise(d, TestStrings1; precalc = true)
+			@test typeof(R4) == typeof(R)
+			@test size(R4) == size(R)
+			for i in 1:size(R4, 1)
+				for j in 1:size(R4, 2)
+					@test equalorNaN(R4[i, j], R[i, j])
+				end
 			end
 		end
 	end
diff --git a/test/runtests.jl b/test/runtests.jl
index 7e391c4..42e8f0a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,3 +3,4 @@ using Test
 
 include("distances.jl")
 include("modifiers.jl")
+include("pairwise.jl")
\ No newline at end of file

From 2ec7c5508ef0fcba0760c7b9bbbb1103f199ae91 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sat, 7 Nov 2020 12:45:58 +0100
Subject: [PATCH 04/21] added doc for pairwise

---
 src/pairwise.jl | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index a974fa5..8cfa60b 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -1,6 +1,33 @@
 _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
 _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
 
+"""
+    pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing)
+    pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing)
+
+`pairwise` returns the distance matrix between all pairs of elements in `itr`
+according to the distance `dist`. The element type of the returned matrix
+can be set via `eltype`. For QGramDistances precalculation will be used either
+if `precalc` is set to true or if there are more than 5 elements in `itr`.
+Set `precalc` to false if no precalculation should be used, regardless of length.
+
+Both symmetric and asymmetric versions are available.
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> iter = ["New York", "Princeton"]
+julia> pairwise(Levenshtein(), iter) # symmetric
+2×2 Array{Float64,2}:
+ 0.0  9.0
+ 9.0  0.0
+julia> iter2 = ["San Francisco"]
+julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
+2×1 Array{Float64,2}:
+ 12.0
+ 10.0
+```
+"""
 pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) =
     pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc)
 

From 0e9faea01c41c4b6fd9c6c70003bc8a2112f5805 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sat, 7 Nov 2020 12:54:37 +0100
Subject: [PATCH 05/21] import Distances and ensure docstring works for
 pairwise

---
 src/pairwise.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index 8cfa60b..b33ddfe 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -1,7 +1,9 @@
 _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
 _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
 
-"""
+import Distances: pairwise
+
+@doc """
     pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing)
     pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing)
 
@@ -28,6 +30,8 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
  10.0
 ```
 """
+pairwise
+
 pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) =
     pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc)
 

From 4840f07109e50a27c2639b8e8bd302818ce45af3 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sat, 7 Nov 2020 13:16:44 +0100
Subject: [PATCH 06/21] script for testing performance of pairwise with and
 without precalculation

---
 test/performance/pairwise.jl | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 test/performance/pairwise.jl

diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl
new file mode 100644
index 0000000..941b6e8
--- /dev/null
+++ b/test/performance/pairwise.jl
@@ -0,0 +1,34 @@
+using StringDistances, Random
+using BenchmarkTools
+
+N = if length(ARGS) > 0
+    try
+        parse(Int, ARGS[1])
+    catch _
+        100
+    end
+else
+    100 # default value
+end
+
+Maxlength = if length(ARGS) > 1
+    try
+        parse(Int, ARGS[2])
+    catch _
+        100
+    end
+else
+    100 # default value
+end
+
+S = String[randstring(rand(3:Maxlength)) for _ in 1:N]
+
+println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":")
+
+dist = Cosine(2)
+t1 = @belapsed dm1 = pairwise(dist, S; precalc = false)
+t2 = @belapsed dm2 = pairwise(dist, S; precalc = true)
+
+println("  - time WITHOUT pre-calculation: ", round(t1, digits = 3))
+println("  - time WITH    pre-calculation: ", round(t2, digits = 3))
+println("  - speedup with pre-calculation: ", round(t1/t2, digits = 1))

From ff22533629ee8648fcfb7a9b1bcb9cd5cda136d4 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sat, 7 Nov 2020 13:26:02 +0100
Subject: [PATCH 07/21] fix so it also works on Julia 1.3

---
 src/pairwise.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index b33ddfe..a1ff071 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -33,18 +33,18 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
 pairwise
 
 pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) =
-    pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc)
+    pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc = precalc)
 
 pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) =
-    pairwise!(_allocmatrix(X, eltype), dist, X; precalc)
+    pairwise!(_allocmatrix(X, eltype), dist, X; precalc = precalc)
 
 pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} =
     (dist isa SemiMetric) ?
-        _symmetric_pairwise!(R, dist, X; precalc) :
-        _asymmetric_pairwise!(R, dist, X, X; precalc)
+        _symmetric_pairwise!(R, dist, X; precalc = precalc) :
+        _asymmetric_pairwise!(R, dist, X, X; precalc = precalc)
 
 pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} =
-    _asymmetric_pairwise!(R, dist, X, Y; precalc)
+    _asymmetric_pairwise!(R, dist, X, Y; precalc = precalc)
 
 _precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)]
 

From 5571d0316a91af7cbed87563d56ffc69bbb34a34 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 12:03:24 +0100
Subject: [PATCH 08/21] fixes based on Mathieu's comments

---
 src/pairwise.jl              | 57 ++++++++++++++++++------------------
 test/pairwise.jl             |  4 +--
 test/performance/pairwise.jl |  4 +--
 3 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index a1ff071..ccdee80 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -1,17 +1,15 @@
 _allocmatrix(X, Y, T) = Matrix{T}(undef, length(X), length(Y))
 _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
 
-import Distances: pairwise
-
 @doc """
-    pairwise(dist::StringDistance, itr; eltype = Float64, precalc = nothing)
-    pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, precalc = nothing)
+    pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing)
+    pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing)
 
 `pairwise` returns the distance matrix between all pairs of elements in `itr`
-according to the distance `dist`. The element type of the returned matrix
-can be set via `eltype`. For QGramDistances precalculation will be used either
-if `precalc` is set to true or if there are more than 5 elements in `itr`.
-Set `precalc` to false if no precalculation should be used, regardless of length.
+according to the `StringDistance` `dist`. The element type of the returned matrix
+can be set via `eltype`. For QGramDistances preprocessing will be used either
+if `preprocess` is set to true or if there are more than 5 elements in `itr`.
+Set `preprocess` to false if no precalculation should be used, regardless of length.
 
 Both symmetric and asymmetric versions are available.
 
@@ -32,37 +30,38 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
 """
 pairwise
 
-pairwise(dist::StringDistance, X, Y; eltype = Float64, precalc = nothing) =
-    pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; precalc = precalc)
+Distances.pairwise(dist::StringDistance, X, Y; eltype = Float64, preprocess = nothing) =
+    pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; preprocess = preprocess)
 
-pairwise(dist::StringDistance, X; eltype = Float64, precalc = nothing) =
-    pairwise!(_allocmatrix(X, eltype), dist, X; precalc = precalc)
+Distances.pairwise(dist::StringDistance, X; eltype = Float64, preprocess = nothing) =
+    pairwise!(_allocmatrix(X, eltype), dist, X; preprocess = preprocess)
 
-pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; precalc = nothing) where {N<:Number} =
+pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X; preprocess = nothing) where {N<:Number} =
     (dist isa SemiMetric) ?
-        _symmetric_pairwise!(R, dist, X; precalc = precalc) :
-        _asymmetric_pairwise!(R, dist, X, X; precalc = precalc)
+        _symmetric_pairwise!(R, dist, X; preprocess = preprocess) :
+        _asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess)
 
-pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; precalc = nothing) where {N<:Number} =
-    _asymmetric_pairwise!(R, dist, X, Y; precalc = precalc)
+pairwise!(R::AbstractMatrix{N}, dist::StringDistance, X, Y; preprocess = nothing) where {N<:Number} =
+    _asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess)
 
-_precalc(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)]
+_preprocess(X, PT, q) = PT[PT(X[i], q) for i in 1:length(X)]
 
 const PrecalcMinLength = 5 # Only precalc if length >= 5
 
-function precalc_if_needed(X, dist::StringDistance, precalc, precalcType)
-    # precalc only if a QGramDistance and
+preprocess_if_needed(X, dist::StringDistance, preprocess, preprocessType) = X
+
+function preprocess_if_needed(X, dist::QGramDistance, preprocess, preprocessType)
+    # preprocess only if a QGramDistance and
     # if precalc set to true or if isnothing and length is at least min length
-    !isa(dist, QGramDistance) && return X
-    cond = (precalc === true) ||
-                (isnothing(precalc) & length(X) >= PrecalcMinLength)
-    cond ? _precalc(X, precalcType, dist.q) : X
+    cond = (preprocess === true) ||
+                (isnothing(preprocess) && length(X) >= PrecalcMinLength)
+    cond ? _preprocess(X, preprocessType, dist.q) : X
 end
 
 function _symmetric_pairwise!(R, dist::StringDistance, X;
-    precalc = nothing, precalcType = QGramSortedVector)
+    preprocess = nothing, preprocessType = QGramSortedVector)
 
-    objs = precalc_if_needed(X, dist, precalc, precalcType)
+    objs = preprocess_if_needed(X, dist, preprocess, preprocessType)
 
     for i in 1:length(objs)
         R[i, i] = 0
@@ -74,10 +73,10 @@ function _symmetric_pairwise!(R, dist::StringDistance, X;
 end
 
 function _asymmetric_pairwise!(R, dist::StringDistance, X, Y;
-    precalc = nothing, precalcType = QGramSortedVector)
+    preprocess = nothing, preprocessType = QGramSortedVector)
 
-    objsX = precalc_if_needed(X, dist, precalc, precalcType)
-    objsY = precalc_if_needed(Y, dist, precalc, precalcType)
+    objsX = preprocess_if_needed(X, dist, preprocess, preprocessType)
+    objsY = preprocess_if_needed(Y, dist, preprocess, preprocessType)
 
     for i in 1:length(objsX)
         Threads.@threads for j in 1:length(objsY)
diff --git a/test/pairwise.jl b/test/pairwise.jl
index 24ebe2f..9a42a47 100644
--- a/test/pairwise.jl
+++ b/test/pairwise.jl
@@ -71,9 +71,9 @@ TestStrings2 = ["mew", "ab"]
 			end
 		end
 
-		# Ensure same result if precalculating for QGramDistances
+		# Ensure same result if preprocessing for QGramDistances
 		if DT <: QGramDistance
-			R4 = pairwise(d, TestStrings1; precalc = true)
+			R4 = pairwise(d, TestStrings1; preprocess = true)
 			@test typeof(R4) == typeof(R)
 			@test size(R4) == size(R)
 			for i in 1:size(R4, 1)
diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl
index 941b6e8..ae3bc26 100644
--- a/test/performance/pairwise.jl
+++ b/test/performance/pairwise.jl
@@ -26,8 +26,8 @@ S = String[randstring(rand(3:Maxlength)) for _ in 1:N]
 println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":")
 
 dist = Cosine(2)
-t1 = @belapsed dm1 = pairwise(dist, S; precalc = false)
-t2 = @belapsed dm2 = pairwise(dist, S; precalc = true)
+t1 = @belapsed dm1 = pairwise(dist, S; preprocess = false)
+t2 = @belapsed dm2 = pairwise(dist, S; preprocess = true)
 
 println("  - time WITHOUT pre-calculation: ", round(t1, digits = 3))
 println("  - time WITH    pre-calculation: ", round(t2, digits = 3))

From 7350e4004f0da11af3df6e3900bec074a082afd8 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 15:49:22 +0100
Subject: [PATCH 09/21] try to fix the docstring problem

---
 src/pairwise.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index ccdee80..fa57db0 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -28,7 +28,7 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
  10.0
 ```
 """
-pairwise
+Distances.pairwise
 
 Distances.pairwise(dist::StringDistance, X, Y; eltype = Float64, preprocess = nothing) =
     pairwise!(_allocmatrix(X, Y, eltype), dist, X, Y; preprocess = preprocess)

From 50747cf4c60ccc28dd5742d8a4a1b4f8cb9d7c52 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 15:54:41 +0100
Subject: [PATCH 10/21] docstring more in line with the one in Distances

---
 src/pairwise.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index fa57db0..174225a 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -5,11 +5,12 @@ _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
     pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing)
     pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing)
 
-`pairwise` returns the distance matrix between all pairs of elements in `itr`
-according to the `StringDistance` `dist`. The element type of the returned matrix
-can be set via `eltype`. For QGramDistances preprocessing will be used either
-if `preprocess` is set to true or if there are more than 5 elements in `itr`.
-Set `preprocess` to false if no precalculation should be used, regardless of length.
+Compute distances between all pairs of elements in `itr`according to the `StringDistance` 
+`dist`. The element type of the returned distance matrix can be set via `eltype`. 
+
+For QGramDistances preprocessing will be used either if `preprocess` is set to true or 
+if there are more than 5 elements in `itr`. Set `preprocess` to false if no 
+preprocessing should be used, regardless of length.
 
 Both symmetric and asymmetric versions are available.
 

From 624184382baf3d27a2c9102b4485148c8d20f135 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 15:56:09 +0100
Subject: [PATCH 11/21] fixed typo and formatting of docstring

---
 src/pairwise.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/pairwise.jl b/src/pairwise.jl
index 174225a..93bdd20 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -5,12 +5,13 @@ _allocmatrix(X, T) = Matrix{T}(undef, length(X), length(X))
     pairwise(dist::StringDistance, itr; eltype = Float64, preprocess = nothing)
     pairwise(dist::StringDistance, itr1, itr2; eltype = Float64, preprocess = nothing)
 
-Compute distances between all pairs of elements in `itr`according to the `StringDistance` 
-`dist`. The element type of the returned distance matrix can be set via `eltype`. 
+Compute distances between all pairs of elements in `itr` according to the
+`StringDistance` `dist`. The element type of the returned distance matrix
+can be set via `eltype`.
 
-For QGramDistances preprocessing will be used either if `preprocess` is set to true or 
-if there are more than 5 elements in `itr`. Set `preprocess` to false if no 
-preprocessing should be used, regardless of length.
+For QGramDistances preprocessing will be used either if `preprocess` is set 
+to true or if there are more than 5 elements in `itr`. Set `preprocess` to 
+false if no preprocessing should be used, regardless of length.
 
 Both symmetric and asymmetric versions are available.
 

From 45ae2d96461f001e1e98d0547b7b1ff34d05ced5 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 16:06:52 +0100
Subject: [PATCH 12/21] cache test strings between runs for more comparable
 results

---
 test/performance/pairwise.jl | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl
index ae3bc26..fd9a060 100644
--- a/test/performance/pairwise.jl
+++ b/test/performance/pairwise.jl
@@ -21,7 +21,31 @@ else
     100 # default value
 end
 
-S = String[randstring(rand(3:Maxlength)) for _ in 1:N]
+# If there are strings already cached to disk we start with them and only
+# add new ones if needed.
+using Serialization
+const CacheFile = joinpath(@__DIR__(), "perfteststrings_$(Maxlength).juliabin")
+S = if isfile(CacheFile)
+    try
+        res = deserialize(CacheFile)
+        println("Read $(length(res)) strings from cache file: $CacheFile")
+        res
+    catch err
+        String[]
+    end
+else
+    println("Creating $N random strings.")
+    String[randstring(rand(3:Maxlength)) for _ in 1:N]
+end
+
+if length(S) < N
+    for i in (length(S)+1):N
+        push!(S, randstring(rand(3:Maxlength)))
+    end
+    println("Saving cache file with $(length(S)) strings: $CacheFile")
+    serialize(CacheFile, S)
+end
+
 
 println("For ", Threads.nthreads(), " threads and ", N, " strings of max length ", Maxlength, ":")
 

From 6c5a14e6694518b13cb7d76c829b3fc49d78c83e Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 16:10:28 +0100
Subject: [PATCH 13/21] ensure test strings cached also if newly created

---
 test/performance/pairwise.jl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl
index fd9a060..b92eb7c 100644
--- a/test/performance/pairwise.jl
+++ b/test/performance/pairwise.jl
@@ -25,6 +25,8 @@ end
 # add new ones if needed.
 using Serialization
 const CacheFile = joinpath(@__DIR__(), "perfteststrings_$(Maxlength).juliabin")
+SaveCache = false
+
 S = if isfile(CacheFile)
     try
         res = deserialize(CacheFile)
@@ -36,12 +38,17 @@ S = if isfile(CacheFile)
 else
     println("Creating $N random strings.")
     String[randstring(rand(3:Maxlength)) for _ in 1:N]
+    SaveCache = true
 end
 
 if length(S) < N
     for i in (length(S)+1):N
         push!(S, randstring(rand(3:Maxlength)))
     end
+    SaveCache = true
+end
+
+if SaveCache
     println("Saving cache file with $(length(S)) strings: $CacheFile")
     serialize(CacheFile, S)
 end

From 9267d07b34e7adc96330bc181913aa70edd46617 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Sun, 8 Nov 2020 16:11:58 +0100
Subject: [PATCH 14/21] fixed ordering bug in test script

---
 test/performance/pairwise.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/performance/pairwise.jl b/test/performance/pairwise.jl
index b92eb7c..b5d88f6 100644
--- a/test/performance/pairwise.jl
+++ b/test/performance/pairwise.jl
@@ -37,8 +37,8 @@ S = if isfile(CacheFile)
     end
 else
     println("Creating $N random strings.")
-    String[randstring(rand(3:Maxlength)) for _ in 1:N]
     SaveCache = true
+    String[randstring(rand(3:Maxlength)) for _ in 1:N]
 end
 
 if length(S) < N

From a24cd49e16b7eeb24206093f14ef0a2f3534e531 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 10:32:34 +0100
Subject: [PATCH 15/21] added the MorisitaOverlap distance which uses the
 multiplicities of q-grams

---
 README.md              |  1 +
 src/StringDistances.jl |  1 +
 src/distances/qgram.jl | 46 +++++++++++++++++++++++++++++++++++++++++-
 test/distances.jl      | 16 +++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 22f63ac..1b977e9 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ The available distances are:
 	- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
 	- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
 	- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
+	- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
 - Distance "modifiers" that can be applied to any distance:
 	- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
 	- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. 
diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index 89efc47..ffc8b84 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -35,6 +35,7 @@ Cosine,
 Jaccard,
 SorensenDice,
 Overlap,
+MorisitaOverlap,
 QGramDict,
 QGramSortedVector,
 Winkler,
diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index a420877..957d6bd 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -377,4 +377,48 @@ newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
 	c.shared += (n1 > 0) & (n2 > 0)
 
 calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
-	1.0 - c.shared / min(c.left, c.right)
\ No newline at end of file
+	1.0 - c.shared / min(c.left, c.right)
+
+"""
+	MorisitaOverlap(q::Int)
+
+Creates a MorisitaOverlap distance, a general, statistical measure of
+dispersion which can also be used on dictionaries such as created
+from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
+
+The distance corresponds to
+
+``(2 * sum(m(s1) .* m(s2))) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
+
+where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
+sum of those counts.
+"""
+struct MorisitaOverlap <: QGramDistance
+	q::Int
+end
+
+mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+	leftsum::T    # sum(m(s1))
+	rightsum::T   # sum(m(s2))
+	leftsq::T     # sum(m(s1).^2)
+	rightsq::T    # sum(m(s2).^2)
+	shared::T     # sum(m(s1) .* m(s2))
+end
+
+newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
+
+@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
+	c.leftsum += n1
+	c.leftsq += (n1^2)
+end
+
+@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
+	c.rightsum += n2
+	c.rightsq += (n2^2)
+end
+
+@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) =
+	c.shared += (n1 * n2)
+
+calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
+	(2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)
diff --git a/test/distances.jl b/test/distances.jl
index 7aaa8da..15f00fd 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -130,6 +130,22 @@ using StringDistances, Unicode, Test, Random
 		@test ismissing(evaluate(Overlap(1), "", missing))
 	end
 
+	@testset "MorisitaOverlap" begin
+		# overlap for 'n', 'h', and 't' and 5 q-grams per string:
+		@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5))
+
+		# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
+		# ms1 = [1, 1, 1, 2, 1, 1, 0]
+		# ms2 = [2, 1, 1, 2, 0, 0, 1]
+		# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
+		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 14/18
+		@test MorisitaOverlap(1)("context", "contact") == 0.8
+
+		@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
+		@inferred evaluate(MorisitaOverlap(1), "", "")
+		@test ismissing(evaluate(MorisitaOverlap(1), "", missing))
+	end
+
 	@testset "QGramDict and QGramSortedVector counts qgrams" begin
 		# To get something we can more easily compare to:
 		stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))

From 36cd8cbfc55a53710af5e23c97e0a5fbe32350b8 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 10:35:05 +0100
Subject: [PATCH 16/21] more detailed doc string for MorisitaOverlap

---
 src/distances/qgram.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index 957d6bd..7d335d8 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -385,6 +385,9 @@ calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
 Creates a MorisitaOverlap distance, a general, statistical measure of
 dispersion which can also be used on dictionaries such as created
 from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
+This is more fine-grained than many of the other QGramDistances since
+it is based on the counts per q-gram rather than only which q-grams are
+in the strings.
 
 The distance corresponds to
 

From 1a6f63206f6a44560f11fa76dbf9b3c3c9b3f00d Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 10:41:45 +0100
Subject: [PATCH 17/21] fix 'bug' in comment

---
 test/distances.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distances.jl b/test/distances.jl
index 15f00fd..2d9c94d 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -138,7 +138,7 @@ using StringDistances, Unicode, Test, Random
 		# ms1 = [1, 1, 1, 2, 1, 1, 0]
 		# ms2 = [2, 1, 1, 2, 0, 0, 1]
 		# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
-		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 14/18
+		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20
 		@test MorisitaOverlap(1)("context", "contact") == 0.8
 
 		@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))

From 9417c89f1ced2ff3e379c7d9539096abfb083cb5 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 10:54:22 +0100
Subject: [PATCH 18/21] added MorisitaOverlap test also for 2-gram

---
 test/distances.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/distances.jl b/test/distances.jl
index 2d9c94d..2c8cef9 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -141,6 +141,12 @@ using StringDistances, Unicode, Test, Random
 		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20
 		@test MorisitaOverlap(1)("context", "contact") == 0.8
 
+		# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
+		# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
+		# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
+		# sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
+		@test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6))
+
 		@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
 		@inferred evaluate(MorisitaOverlap(1), "", "")
 		@test ismissing(evaluate(MorisitaOverlap(1), "", missing))

From 5cc500053fa17c125c7a4cec4004958d28b0a1de Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 11:11:03 +0100
Subject: [PATCH 19/21] also test MorisitaOverlap with preprocessing

---
 test/distances.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distances.jl b/test/distances.jl
index 2c8cef9..4d06064 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -234,7 +234,7 @@ using StringDistances, Unicode, Test, Random
 	end
 
 	@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
-		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
+		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
 			for _ in 1:100
 				qlen = rand(2:9)
 				dist = D(qlen)

From 919a78aa08525f380cb119863ff5fa5a1fc1fcc6 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 16:57:30 +0100
Subject: [PATCH 20/21] fix bug in MorisitaOverlap and added
 NormalizedMultisetDistance (NMD)

---
 README.md              |  1 +
 src/StringDistances.jl |  2 ++
 src/distances/qgram.jl | 43 +++++++++++++++++++++++++++++++++++++++++-
 test/distances.jl      | 28 ++++++++++++++++++++++-----
 4 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 1b977e9..721e033 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ The available distances are:
 	- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
 	- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
 	- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
+	- [NormalizedMultisetDistance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)`
 - Distance "modifiers" that can be applied to any distance:
 	- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
 	- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. 
diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index ffc8b84..be59d7b 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -36,6 +36,8 @@ Jaccard,
 SorensenDice,
 Overlap,
 MorisitaOverlap,
+NormalizedMultisetDistance,
+NMD,
 QGramDict,
 QGramSortedVector,
 Winkler,
diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index 7d335d8..b16d124 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -424,4 +424,45 @@ end
 	c.shared += (n1 * n2)
 
 calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
-	(2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)
+	1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
+
+"""
+	NormalizedMultisetDistance(q::Int)
+	NMD(q::Int)
+
+Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
+Zigouris 2013.
+See https://www.sciencedirect.com/science/article/pii/S1047320313001417
+
+The distance corresponds to
+
+``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))``
+
+where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
+sum of those counts.
+"""
+struct NormalizedMultisetDistance <: QGramDistance
+	q::Int
+end
+const NMD = NormalizedMultisetDistance # frequently used acronym
+
+newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0)
+
+@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
+	c.left += n1
+	c.shared += n1 # max(n1, 0) == n1
+end
+
+@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
+	c.right += n2
+	c.shared += n2 # max(n2, 0) == n2
+end
+
+@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
+	c.left += n1
+	c.right += n2
+	c.shared += max(n1, n2)
+end
+
+calculate(d::NMD, c::ThreeCounters{Int, NMD}) =
+	(c.shared - min(c.left, c.right)) / max(c.left, c.right)
diff --git a/test/distances.jl b/test/distances.jl
index 4d06064..003a452 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random
 
 	@testset "MorisitaOverlap" begin
 		# overlap for 'n', 'h', and 't' and 5 q-grams per string:
-		@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5))
+		@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))
 
 		# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
 		# ms1 = [1, 1, 1, 2, 1, 1, 0]
 		# ms2 = [2, 1, 1, 2, 0, 0, 1]
 		# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
-		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20
-		@test MorisitaOverlap(1)("context", "contact") == 0.8
+		@test evaluate(MorisitaOverlap(1), "context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1.0 - 16/20
+		@test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4
 
 		# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
 		# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
 		# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
 		# sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
-		@test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6))
+		@test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6))
 
 		@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
 		@inferred evaluate(MorisitaOverlap(1), "", "")
 		@test ismissing(evaluate(MorisitaOverlap(1), "", missing))
 	end
 
+	@testset "NMD" begin
+		# m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
+		@test evaluate(NMD(1), "night", "nacht") == 0.4 # (7-5)/5
+
+		# ms1 = [1, 1, 1, 2, 1, 1, 0]
+		# ms2 = [2, 1, 1, 2, 0, 0, 1]
+		@test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7)
+		@test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4
+
+		# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
+		# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
+		@test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6
+
+		@test result_type(NMD(1), "hello", "world") == typeof(float(1))
+		@inferred evaluate(NMD(1), "", "")
+		@test ismissing(evaluate(NMD(1), "", missing))
+	end
+
 	@testset "QGramDict and QGramSortedVector counts qgrams" begin
 		# To get something we can more easily compare to:
 		stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
@@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random
 	end
 
 	@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
-		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
+		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD]
 			for _ in 1:100
 				qlen = rand(2:9)
 				dist = D(qlen)

From d1c283a517da8aec835f5f8eb365b0dbd5b8e659 Mon Sep 17 00:00:00 2001
From: Robert Feldt <robert.feldt@gmail.com>
Date: Tue, 10 Nov 2020 17:10:36 +0100
Subject: [PATCH 21/21] better docstring for NMD

---
 src/distances/qgram.jl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index b16d124..5efd87d 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -431,8 +431,9 @@ calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
 	NMD(q::Int)
 
 Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
-Zigouris 2013.
-See https://www.sciencedirect.com/science/article/pii/S1047320313001417
+Zigouris 2013. The goal with this distance is to behave similarly to a normalized
+compression distance without having to do any actual compression (and thus being
+faster to compute).
 
 The distance corresponds to
 
@@ -440,6 +441,9 @@ The distance corresponds to
 
 where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
 sum of those counts.
+
+For details see:
+https://www.sciencedirect.com/science/article/pii/S1047320313001417
 """
 struct NormalizedMultisetDistance <: QGramDistance
 	q::Int