Skip to content

Commit

Permalink
paralellized "afind"
Browse files Browse the repository at this point in the history
  • Loading branch information
markvanderloo committed Jul 14, 2020
1 parent a21d0b8 commit a39b935
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 61 deletions.
39 changes: 28 additions & 11 deletions pkg/R/afind.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,23 @@
#' @param x \code{[character]} strings to search in
#' @param pattern \code{[character]} strings to find (not a regular expression).
#' @param window \code{[integer]} width of moving window
#' @param value \code{[logical]} toggle return matrix with matched strings.
#' @inheritParams amatch
#'
#' @details
#' Matching is case-sensitive. Both \code{x} and \code{pattern} are converted
#' to \code{UTF-8} prior to search, unless \code{useBytes=TRUE}, in which case
#' the distances are measured bytewise.
#'
#' Code is parallelized over the \code{x} variable: each value of \code{x}
#' is scanned for every element in \code{pattern} using a separate thread (when \code{nthread}
#' is larger then 1).
#'
#' The current implementation is naive, in the sense that for each string
#' \code{s} in \code{x}, \code{nchar(s) - window + 1} separate distances are
#' computed. At the moment no attempt is made to speed up the calculation by
#' using that consecutive windows overlap.
#'
#'
#' @return
#' A \code{list} of three matrices, each of with \code{length(x)} rows and \code{length(pattern)}
Expand All @@ -27,7 +37,7 @@
#' \item{\code{distance}. \code{[character]}, the string distance between pattern and
#' the best matching window.}
#' \item{\code{match}. \code{[character]}, the first, best matching window.}
#'
#'
#' }
#'
#' @family matching
Expand All @@ -44,12 +54,15 @@
#'
#' @export
afind <- function(x, pattern, window=nchar(enc2utf8(pattern))
, value=TRUE
, method = c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex")
, useBytes = FALSE
, weight=c(d=1,i=1,s=1,t=1)
, q = 1
, p = 0
, bt = 0){
, bt = 0
, nthread = getOption("sd_num_thread")
){

stopifnot(
all(is.finite(weight))
Expand All @@ -62,6 +75,7 @@ afind <- function(x, pattern, window=nchar(enc2utf8(pattern))
, is.logical(useBytes)
, ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
, ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
, nthread > 0
)
x <- as.character(x)
pattern <- as.character(pattern)
Expand All @@ -80,26 +94,29 @@ afind <- function(x, pattern, window=nchar(enc2utf8(pattern))
stop(sprintf("method '%s' is not defined",method))
}

L <- .Call("R_afind", x, pattern
L <- .Call("R_afind"
, x
, pattern
, as.integer(window)
, method
, as.double(weight)
, as.double(p)
, as.double(bt)
, as.integer(q)
, as.integer(useBytes)
, as.integer(nthread)
, PACKAGE="stringdist")

names(L) <- c("location", "distance")

matches = sapply(seq_along(pattern), function(i){
substr(x, L[[1]][,i], L[[1]][,i] + window[i]-1)
})

list(location = L[[1]]
, distance = L[[2]]
, match = matrix(matches,nrow=length(x))
)
if (isTRUE(value)){
matches = sapply(seq_along(pattern), function(i){
substr(x, L[[1]][,i], L[[1]][,i] + window[i]-1)
})
L$match <- matrix(matches, nrow=length(x))
}

L
}


Expand Down
8 changes: 7 additions & 1 deletion pkg/inst/tinytest/test_afind.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

options(sd_num_thread=1L)

texts = c("When I grow up, I want to be"
, "one of the harversters of the sea"
Expand Down Expand Up @@ -36,7 +36,13 @@ expect_equal(out$location, location)
expect_equal(out$distance, distance)
expect_equal(out$match, match)

# test paralellization

out1 <- afind(texts, patterns, method="osa", nthread=2L)
expect_identical(out, out1)

# test option

out2 <- afind(texts, patterns, value=FALSE)
expect_equal(length(out2), 2)

4 changes: 2 additions & 2 deletions pkg/src/R_register_native.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
*/

/* .Call calls */
extern SEXP R_afind(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP R_afind(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP R_all_int(SEXP);
extern SEXP R_amatch(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP R_get_qgrams(SEXP, SEXP);
Expand All @@ -18,7 +18,7 @@ extern SEXP R_soundex(SEXP, SEXP);
extern SEXP R_stringdist(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);

static const R_CallMethodDef CallEntries[] = {
{"R_afind", (DL_FUNC) &R_afind, 9},
{"R_afind", (DL_FUNC) &R_afind, 10},
{"R_all_int", (DL_FUNC) &R_all_int, 1},
{"R_amatch", (DL_FUNC) &R_amatch, 12},
{"R_get_qgrams", (DL_FUNC) &R_get_qgrams, 2},
Expand Down
112 changes: 65 additions & 47 deletions pkg/src/Rstringdist.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

// TODO: catch error and report.

static Stringdist *R_open_stringdist(Distance d, int max_len_a, int max_len_b, SEXP weight, SEXP p, SEXP bt, SEXP q){

Stringdist *sd = NULL;
Expand Down Expand Up @@ -333,7 +333,7 @@ SEXP R_lower_tri(SEXP a, SEXP method
// the best match with 'pattern'.
SEXP R_afind(SEXP a, SEXP pattern, SEXP width
, SEXP method, SEXP weight, SEXP p, SEXP bt
, SEXP q, SEXP useBytes)
, SEXP q, SEXP useBytes, SEXP nthrd)
{

int na = length(a) // nr of texts to search
Expand Down Expand Up @@ -371,58 +371,76 @@ SEXP R_afind(SEXP a, SEXP pattern, SEXP width
max_window = window[i];
}
}
Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
, max_window, ml_b
, weight
, p
, bt
, q
);

// allocate memory to store the strings
unsigned int *s = NULL, *t = NULL;
s = (unsigned int *) malloc(( 2L + ml_a + ml_b) * sizeof(int));

// t is the location of the pattern
t = s + ml_a + 1L;

int len_s, len_t, isna_s, isna_t, max_k, k_min, current_window, offset;


double d, d_min;
#ifdef _OPENMP
int nthreads = MIN(INTEGER(nthrd)[0],na);
#pragma omp parallel num_threads(nthreads) default(none) \
shared(yloc,ydist, na, npat, R_PosInf, NA_REAL, NA_INTEGER, bytes, intdist, \
method, weight, p, bt, q, ml_a, ml_b, window, max_window, a, pattern)
#endif
{ // start parallel region


Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
, max_window, ml_b
, weight
, p
, bt
, q
);

// allocate memory to store the strings
unsigned int *s = NULL, *t = NULL;
s = (unsigned int *) malloc(( 2L + ml_a + ml_b) * sizeof(int));

// t is the location of the pattern
t = s + ml_a + 1L;

int len_s, len_t, isna_s, isna_t, max_k, k_min, current_window, offset;
int ID, num_threads;

double d, d_min;

for( int j = 0; j < npat; j++){
// get pattern
get_elem(pattern, j, bytes, intdist, &len_t, &isna_t, t);
current_window = window[j];
offset = j*na;
for ( int i = 0; i < na; i++ ){

#ifdef _OPENMP
ID = omp_get_thread_num();
num_threads = omp_get_num_threads();
#endif
for ( int i = ID; i < na; i += num_threads ){
// get text to search
get_elem(a, i, bytes, intdist, &len_s, &isna_s, s);
if (isna_s || isna_t){ // something to search in, or find?
yloc[offset + i] = NA_INTEGER;
ydist[offset + i] = NA_REAL;
} else if ( current_window >= len_s ){ // is the text shorter than the pattern?
yloc[offset + i] = 1L;
ydist[offset + i] = stringdist(sd, s, len_s, t, len_t);
} else { // slide window over text and compute distances
max_k = len_s - current_window;
d_min = R_PosInf;
k_min = 0;
for (int k = 0; k <= max_k; k++){
d = stringdist(sd, s + k, current_window, t, len_t);
if ( d < d_min ){
d_min = d;
k_min = k;
} // end loop over windows
for( int j = 0; j < npat; j++){
// get pattern
get_elem(pattern, j, bytes, intdist, &len_t, &isna_t, t);
current_window = window[j];
offset = j*na;
if (isna_s || isna_t){ // something to search in, or find?
yloc[offset + i] = NA_INTEGER;
ydist[offset + i] = NA_REAL;
} else if ( current_window >= len_s ){ // is the text shorter than the pattern?
yloc[offset + i] = 1L;
ydist[offset + i] = stringdist(sd, s, len_s, t, len_t);
} else { // slide window over text and compute distances
max_k = len_s - current_window;
d_min = R_PosInf;
k_min = 0;
for (int k = 0; k <= max_k; k++){
d = stringdist(sd, s + k, current_window, t, len_t);
if ( d < d_min ){
d_min = d;
k_min = k;
} // end loop over windows
}
yloc[offset + i] = k_min + 1;
ydist[offset + i] = d_min;
}
yloc[offset + i] = k_min + 1;
ydist[offset + i] = d_min;
}
} // end loop over strings
} // end loop over patterns.
} // end loop over strings
} // end loop over patterns.

close_stringdist(sd);
} // end parallel region

close_stringdist(sd);
UNPROTECT(1);
return(out_list);

Expand Down

0 comments on commit a39b935

Please sign in to comment.