diff --git a/pkg/R/afind.R b/pkg/R/afind.R index 5308453..68d6c58 100644 --- a/pkg/R/afind.R +++ b/pkg/R/afind.R @@ -31,7 +31,10 @@ #' The functions \code{grab} and \code{grabl} are approximate string matching #' functions that mimic base R's \code{\link[base]{grep}} and #' \code{\link[base:grep]{grepl}}. They are implemented as convenience wrappers -#' of \code{find}. +#' of \code{find}. For \code{grabl} there is one difference with \code{grepl}. +#' The result of \code{grepl("foo",NA)} is \code{FALSE}, which seems inconsistent +#' with \code{grepl(NA, NA)} and \code{grepl(NA, "foo")} which return \code{NA}. +#' \code{grabl} returns \code{NA} when either \code{pattern} or \code{x} is \code{NA}. #' #' @section Running cosine distance: #' This algorithm gains efficiency by using that two consecutive windows have @@ -174,16 +177,16 @@ grab <- function(x, pattern, maxDist=Inf, value=FALSE, ...){ grabl <- function(x, pattern, maxDist=Inf, ...){ stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) L <- afind(x, pattern, value=FALSE, ...) - L$distance <= maxDist + as.logical(L$distance <= maxDist) } #' @rdname afind #' #' @return -#' For \code{extract}, a \code{character} vector of \code{length(x)}, with -#' \code{NA} where no match was found and the first matched string if there is -#' a match. (similar to \code{stringr::str_extract}). +#' For \code{extract}, a \code{character} matrix with \code{length(x)} rows and +#' \code{length(pattern)} columns. If match was found, element \eqn{(i,j)} +#' contains the match, otherwise it is set to \code{NA}. #' @export extract <- function(x, pattern, maxDist = Inf, ...){ stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) diff --git a/pkg/inst/tinytest/test_afind.R b/pkg/inst/tinytest/test_afind.R index ca327dc..d522515 100644 --- a/pkg/inst/tinytest/test_afind.R +++ b/pkg/inst/tinytest/test_afind.R @@ -51,10 +51,10 @@ expect_equal(length(out2), 2) expect_equal(grab(texts, "harvester", maxDist=2), 2) expect_equal(grab(texts, "harvester", value=TRUE, maxDist=2), "harverste") expect_equal(grabl(texts, "harvester", maxDist=2) - , matrix(c(FALSE,TRUE,FALSE,FALSE),nrow=4)) + , c(FALSE,TRUE,FALSE,FALSE)) expect_equal(extract(texts, "harvester", maxDist=2) - , matrix(c(NA, "harverste",NA,NA), nrow=4) ) + , matrix(c(NA, "harverste",NA,NA),nrow=4) ) ## Test running_cosine pattern <- c("phish", "want to") @@ -76,7 +76,30 @@ for ( method in methods ){ expect_equal(afind(text, pattern, method=method, q=3, p=0.1)$location[1,1], 19, info=method) } +## test the usual edge cases +# notice: window size = 0. +expect_equal(afind("foo","")$distance[1], 0) + +expect_equal(afind("foo",NA)$distance[1], NA_real_) +expect_equal(afind("foo",NA)$location[1], NA_integer_) +expect_equal(afind("foo",NA)$match[1], NA_character_) + +expect_equal(afind(NA,"foo")$distance[1], NA_real_) +expect_equal(afind(NA,"foo")$location[1], NA_integer_) +expect_equal(afind(NA,"foo")$match[1], NA_character_) + +expect_equal(afind("","foo")$distance[1], 3) +expect_equal(afind("","foo")$location[1], 1) +expect_equal(afind("","foo")$match[1], "") + +expect_equal(grab("foo", ""), 1L) +expect_equal(grabl("foo",""), TRUE) +expect_equal(grab("foo",NA), integer(0)) + +# note that 'grepl' gives FALSE in this case (which is inconsistent with +# grepl(NA, NA), grepl(NA, "foo"). +expect_equal(grabl("foo",NA), NA) diff --git a/pkg/src/Rstringdist.c b/pkg/src/Rstringdist.c index fdcd4b9..f747e28 100644 --- a/pkg/src/Rstringdist.c +++ b/pkg/src/Rstringdist.c @@ -418,7 +418,7 @@ SEXP R_afind(SEXP a, SEXP pattern, SEXP width if (isna_s || isna_t){ // something to search in, or find? yloc[offset + i] = NA_INTEGER; ydist[offset + i] = NA_REAL; - } else if ( current_window >= len_s ){ // is the text shorter than the pattern? + } else if ( current_window >= len_s ){ // is the text shorter than the window? yloc[offset + i] = 1L; ydist[offset + i] = stringdist(sd, s, len_s, t, len_t); } else { // slide window over text and compute distances diff --git a/pkg/src/qgram.c b/pkg/src/qgram.c index 24fe77a..b93c2bc 100644 --- a/pkg/src/qgram.c +++ b/pkg/src/qgram.c @@ -456,7 +456,7 @@ double running_cosine_dist( double *store ){ - double d, ww, wp, pp; + double d; unsigned int *first_qgram; unsigned int *last_qgram;