-
Notifications
You must be signed in to change notification settings - Fork 36
/
stringdist.R
364 lines (328 loc) · 11.5 KB
/
stringdist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#' A package for string distance calculation and approximate string matching.
#'
#'
#' The \pkg{stringdist} package offers fast and platform-independent string
#' metrics. Its main purpose is to compute various string distances and to do
#' approximate text matching between character vectors. As of version 0.9.3,
#' it is also possible to compute distances between sequences represented by
#' integer vectors.
#'
#'
#' A typical use is to match strings that are not precisely the same. For
#' example
#'
#' \code{ amatch(c("hello","g'day"),c("hi","hallo","ola"),maxDist=2)}
#'
#' returns \code{c(2,NA)} since \code{"hello"} matches closest with
#' \code{"hallo"}, and within the maximum (optimal string alignment) distance.
#' The second element, \code{"g'day"}, matches closest with \code{"ola"} but
#' since the distance equals 4, no match is reported.
#'
#' A second typical use is to compute string distances. For example
#'
#' \code{ stringdist(c("g'day"),c("hi","hallo","ola"))}
#'
#' Returns \code{c(5,5,4)} since these are the distances between \code{"g'day"}
#' and respectively \code{"hi"}, \code{"hallo"}, and \code{"ola"}.
#'
#' A third typical use would be to compute a \code{dist} object. The command
#'
#' \code{stringdistmatrix(c("foo","bar","boo","baz"))}
#'
#' returns an object of class \code{dist} that can be used by clustering
#' algorithms such as \code{stats::hclust}.
#'
#' A fourth use is to compute string distances between general sequences,
#' represented as integer vectors (which must be stored in a \code{list}):
#'
#' \code{seq_dist( list(c(1L,1L,2L)), list(c(1L,2L,1L),c(2L,3L,1L,2L)) )}
#'
#' The above code yields the vector \code{c(1,2)} (the first shorter first
#' argument is recycled over the longer second argument)
#'
#' Besides documentation for each function, the main topics documented are:
#'
#' \itemize{
#' \item{\code{\link{stringdist-metrics}} -- string metrics supported by the package}
#' \item{\code{\link{stringdist-encoding}} -- how encoding is handled by the package}
#' \item{\code{\link{stringdist-parallelization}} -- on multithreading }
#' }
#'
#' @section Acknowledgements:
#' \itemize{
#' \item{The code for the full Damerau-Levenshtein distance was adapted from Nick Logan's
#' \href{https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c}{public github repository}.}
#' \item{C code for converting UTF-8 to integer was copied from the R core for performance reasons.}
#' \item{The code for soundex conversion and string similarity was kindly contributed by Jan van der Laan.}
#' }
#' @section Citation:
#' If you would like to cite this package, please cite the \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{R Journal Paper}:
#' \itemize{
#' \item{M.P.J. van der Loo (2014). The \code{stringdist} package for approximate string matching.
#' R Journal 6(1) pp 111-122}
#' }
#' Or use \code{citation('stringdist')} to get a bibtex item.
#'
#' @name stringdist-package
#' @docType package
#' @useDynLib stringdist, .registration=TRUE
#' @importFrom parallel detectCores
#'
#'
#'
{}
listwarning <- function(x,y){
sprintf("
You are passing one or more arguments of type 'list' to
'%s'. These arguments will be converted with 'as.character'
which is likeley not to give what you want (did you mean to use '%s'?).
This warning can be avoided by explicitly converting the argument(s).
",x,y)
}
#' Compute distance metrics between strings
#'
#'
#' \code{stringdist} computes pairwise string distances between elements of
#' \code{a} and \code{b}, where the argument with less elements is recycled.
#' \code{stringdistmatrix} computes the string distance matrix with rows
#' according to
#' \code{a} and columns according to \code{b}.
#'
#'
#' @param a R object (target); will be converted by \code{as.character}
#' @param b R object (source); will be converted by \code{as.character}
#' This argument is optional for \code{stringdistmatrix} (see section \code{Value}).
#' @param method Method for distance calculation. The default is \code{"osa"},
#' see \code{\link{stringdist-metrics}}.
#' @param useBytes Perform byte-wise comparison, see
#' \code{\link{stringdist-encoding}}.
#' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
#' deletion, insertion, substitution and transposition, in that order. When
#' \code{method='lv'}, the penalty for transposition is ignored. When
#' \code{method='jw'}, the weights associated with characters of \code{a},
#' characters from \code{b} and the transposition weight, in that order.
#' Weights must be positive and not exceed 1. \code{weight} is ignored
#' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
#' \code{'Jaccard'}, \code{'lcs'}, or \code{soundex}.
#' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to
#' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
#' @param p Prefix factor for Jaro-Winkler distance. The valid range for
#' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
#' Jaro-distance is returned. Applies only to \code{method='jw'}.
#' @param bt Winkler's boost threshold. Winkler's prefix factor is
#' only applied when the Jaro distance is larger than \code{bt}.
#' Applies only to \code{method='jw'} and \code{p>0}.
#' @param nthread Maximum number of threads to use. By default, a sensible
#' number of threads is chosen, see \code{\link{stringdist-parallelization}}.
#'
#' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}}
#'
#' @return For \code{stringdist}, a vector with string distances of size
#' \code{max(length(a),length(b))}.
#'
#' For \code{stringdistmatrix}: if both \code{a} and \code{b} are passed, a
#' \code{length(a)xlength(b)} \code{matrix}. If a single argument \code{a} is
#' given an object of class \code{\link[stats]{dist}} is returned.
#'
#' Distances are nonnegative if they can be computed, \code{NA} if any of the
#' two argument strings is \code{NA} and \code{Inf} when \code{maxDist} is
#' exceeded or, in case of the hamming distance, when the two compared strings
#' have different length.
#'
#'
#' @example ../examples/stringdist.R
#' @export
stringdist <- function(a, b
, method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex")
, useBytes = FALSE
, weight=c(d=1,i=1,s=1,t=1)
, q = 1
, p = 0
, bt = 0
, nthread = getOption("sd_num_thread")
){
if (is.list(a)|is.list(b))
warning(listwarning("stringdist","seq_dist"))
stopifnot(
all(is.finite(weight))
, all(weight > 0)
, all(weight <=1)
, q >= 0
, p <= 0.25
, p >= 0
, is.logical(useBytes)
, ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
, ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
, nthread > 0
)
# note: enc2utf8 is very efficient when the native encoding is already UTF-8.
a <- as.character(a)
b <- as.character(b)
if ( !useBytes ){
a <- enc2utf8(a)
b <- enc2utf8(b)
}
if (length(a) == 0 || length(b) == 0){
return(numeric(0))
}
if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){
warning(RECYCLEWARNING)
}
method <- match.arg(method)
nthread <- as.integer(nthread)
if (method == 'jw') weight <- weight[c(2,1,3)]
do_dist(a=b, b=a
, method=method
, weight=weight
, q=q
, p=p
, bt=bt
, useBytes=useBytes
, nthread=nthread)
}
#' @param useNames Use input vectors as row and column names?
#'
#'
#' @rdname stringdist
#' @export
stringdistmatrix <- function(a, b
, method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
, useBytes = FALSE
, weight=c(d=1,i=1,s=1,t=1)
, q = 1
, p = 0
, bt = 0
, useNames=c('none','strings','names')
, nthread = getOption("sd_num_thread")
){
if (is.list(a)|| (!missing(b) && is.list(b)) ){
warning(listwarning("stringdistmatrix","seq_distmatrix"))
}
# for backward compatability with stringdist <= 0.9.0
if (identical(useNames, FALSE)) useNames <- "none"
if (identical(useNames, TRUE)) useNames <- "strings"
useNames <- match.arg(useNames)
method <- match.arg(method)
nthread <- as.integer(nthread)
stopifnot(
all(is.finite(weight))
, all(weight > 0)
, all(weight <=1)
, q >= 0
, p <= 0.25
, p >= 0
, is.logical(useBytes)
, ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
, ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
, nthread > 0
)
if (method == 'jw') weight <- weight[c(2,1,3)]
# if b is missing, generate a 'dist' object.
if (missing(b)){
if (useNames == "names"){
a <- setNames(as.character(a),names(a))
} else {
a <- as.character(a)
}
return( lower_tri(a
, method=method
, useBytes=useBytes
, weight=weight
, q=q
, p=p
, bt=bt
, useNames=useNames
, nthread=nthread)
)
}
if (useNames == "names"){
rowns <- names(a)
colns <- names(b)
}
# NOTE: this strips off names
a <- as.character(a)
b <- as.character(b)
if (useNames=="strings"){
rowns <- a
colns <- b
}
if (!useBytes){
a <- enc2utf8(a)
b <- enc2utf8(b)
}
if (length(a) == 0 || length(b) == 0){
return(matrix(numeric(0)))
}
x <- vapply(b, do_dist, USE.NAMES=FALSE, FUN.VALUE=numeric(length(a))
, a, method,weight, q, p, bt, useBytes, nthread)
if (useNames %in% c("strings","names") ){
structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns)))
} else {
matrix(x,nrow=length(a),ncol=length(b))
}
}
char2int <- function(x){
# For some OS's enc2utf8 had unexpected behavior for NA's,
# see https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15201.
# This is fixed for R >= 2.15.3.
# i <- !is.na(x)
# x[i] <- enc2utf8(x[i])
lapply(enc2utf8(x),utf8ToInt)
}
# enum-type in stringdist.h
METHODS <- c(
osa = 0L
, lv = 1L
, dl = 2L
, hamming = 3L
, lcs = 4L
, qgram = 5L
, cosine = 6L
, jaccard = 7L
, jw = 8L
, soundex = 9L
, running_cosine = 10L
)
do_dist <- function(a, b, method, weight, q, p, bt, useBytes=FALSE, nthread=1L){
if (method=='soundex' && !all(printable_ascii(a) & printable_ascii(b)) ){
warning("Non-printable ascii or non-ascii characters in soundex. Results may be unreliable. See ?printable_ascii.")
}
method <- METHODS[method]
if ( is.na(method) ){
stop(sprintf("method '%s' is not defined",method))
}
d <- .Call("R_stringdist", a, b, method
, as.double(weight), as.double(p), as.double(bt), as.integer(q)
, as.integer(useBytes), as.integer(nthread)
, PACKAGE="stringdist"
)
d
}
# more efficient function that returns a square distance matrix as a 'stats::dist' object.
lower_tri <- function(a
, method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
, useBytes = FALSE
, weight=c(d=1,i=1,s=1,t=1)
, q=1
, p=0
, bt=0
, useNames=FALSE
, nthread = getOption("sd_num_thread")
){
methnr <- METHODS[method]
if (is.na(method)){
stop(sprintf("method '%s' is not defined",method))
}
x <- .Call("R_lower_tri", a, methnr
, as.double(weight), as.double(p), as.double(bt)
, as.integer(q), as.integer(useBytes), as.integer(nthread)
, PACKAGE="stringdist")
attributes(x) <- list(class='dist'
, Size = length(a)
, Diag = FALSE
, Upper = FALSE
, method = method)
if (useNames == "strings") attr(x,"Labels") <- as.character(a)
if (useNames == "names" ) attr(x,"Labels") <- names(a)
x
}