Skip to content

Commit

Permalink
Merge 6874d59 into dc4157a
Browse files Browse the repository at this point in the history
  • Loading branch information
Johannes Gruber committed Nov 2, 2019
2 parents dc4157a + 6874d59 commit a7fd81a
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 7 deletions.
1 change: 1 addition & 0 deletions pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ export(seq_sim)
export(stringdist)
export(stringdistmatrix)
export(stringsim)
export(stringsimmatrix)
importFrom(parallel,detectCores)
useDynLib(stringdist, .registration=TRUE)
38 changes: 31 additions & 7 deletions pkg/R/stringsim.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#' \code{stringsim} computes pairwise string similarities between elements of
#' \code{character} vectors \code{a} and \code{b}, where the vector with fewer
#' elements is recycled.
#' \code{stringsimmatrix} computes the string similarity matrix with rows
#' according to \code{a} and columns according to \code{b}.
#'
#' @param a R object (target); will be converted by \code{as.character}.
#' @param b R object (source); will be converted by \code{as.character}.
Expand All @@ -11,14 +13,16 @@
#' @param useBytes Perform byte-wise comparison, see \code{\link{stringdist-encoding}}.
#' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to
#' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
#' @param ... additional arguments are passed on to \code{\link{stringdist}}.
#'
#' @param ... additional arguments are passed on to \code{\link{stringdist}} and
#' \code{\link{stringdistmatrix}} respectively.
#' @return
#' Returns a vector with similarities, which are values between 0 and 1 where
#' 1 corresponds to perfect similarity (distance 0) and 0 to complete
#' dissimilarity. \code{NA} is returned when \code{\link{stringdist}} returns
#' \code{NA}. Distances equal to \code{Inf} are truncated to a similarity of
#' 0.
#' \code{stringsim} returns a vector with similarities, which are values between
#' 0 and 1 where 1 corresponds to perfect similarity (distance 0) and 0 to
#' complete dissimilarity. \code{NA} is returned when \code{\link{stringdist}}
#' returns \code{NA}. Distances equal to \code{Inf} are truncated to a
#' similarity of 0. \code{stringsimmatrix} works the same way but, analogous to
#' \code{\link{stringdistmatrix}}, returns a similarity matrix instead of a
#' vector.
#'
#' @details
#' The similarity is calculated by first calculating the distance using
Expand All @@ -45,6 +49,24 @@ stringsim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs",
}


#' @rdname stringsim
#' @export
stringsimmatrix <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs",
"qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) {
  # Compute the string similarity matrix, with rows according to `a` and
  # columns according to `b`. When `b` is not supplied, `a` is compared
  # with itself.
  #
  # a, b      R objects holding the strings to compare.
  # method    Distance method; resolved to a single value with match.arg().
  # useBytes  If TRUE, normalisation uses byte counts instead of characters.
  # q         q-gram size, forwarded to the distance and the normalisation.
  # ...       Forwarded unchanged to stringdistmatrix().
  method <- match.arg(method)
  nctype <- if (useBytes) "bytes" else "char"

  # missing(b) has to be tested before `b` is assigned below.
  if (missing(b)) {
    # One-argument form: only `a` is passed on, and `a` then also serves as
    # `b` for the normalisation step.
    dist <- stringdist::stringdistmatrix(a, method = method,
      useBytes = useBytes, q = q, ...)
    b <- a
  } else {
    dist <- stringdist::stringdistmatrix(a, b, method = method,
      useBytes = useBytes, q = q, ...)
  }

  # A single normalisation call serves both branches.
  normalize_dist(dist, a, b, method = method, nctype = nctype, q = q)
}


#' Compute similarity scores between sequences of integers
#'
#' @param a \code{list} of \code{integer} vectors (target)
Expand Down Expand Up @@ -91,6 +113,8 @@ lengths.list <- function(x,...){

normalize_dist <- function(dist, a, b, method, nctype="char",q=1L){

if (class(dist) == "dist") dist <- as.matrix(dist)

# Normalise the distance by dividing it by the maximum possible distance
if (method == "hamming") {
max_dist <- if (length(b) > length(a)) lengths(b,type=nctype) else lengths(a,type=nctype)
Expand Down
12 changes: 12 additions & 0 deletions pkg/inst/tinytest/test_stringsim.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ for (method in methods[6:8]){
expect_equal(stringsim(x,y,method="jw", useBytes=FALSE), 1-1/3)
expect_equal(stringsim(x,y,method="jw", useBytes=TRUE ), (1/2 + 1/3 +1)/3)

# stringsimmatrix
x <- names(islands)[1:10]
y <- rev(x)

# Check the result type with is.matrix() instead of comparing class() to
# "matrix": since R 4.0.0 class() of a matrix is c("matrix", "array"), so an
# equality comparison against "matrix" fails there.

# Two-argument form: rows follow x, columns follow y.
sim_xy <- stringsimmatrix(x, y, method = "osa", useBytes = FALSE)
expect_true(is.matrix(sim_xy))
expect_equal(dim(sim_xy), c(10, 10))
expect_equal(sim_xy[2, 2], 0.2)

# One-argument form: x is compared with itself.
sim_xx <- stringsimmatrix(x, method = "osa", useBytes = FALSE)
expect_true(is.matrix(sim_xx))
expect_equal(dim(sim_xx), c(10, 10))
expect_equal(sim_xx[2, 9], 0.2)

# NOTE(review): these two checks exercise stringdistmatrix(), not
# stringsimmatrix() -- confirm whether that is intended.
expect_warning(stringdistmatrix(list('a')))
expect_warning(stringdistmatrix(list('a'),list('b')))

## seq_sim

# We used to have list(1:3, 2:4) and list(1:3). This occasionally
Expand Down

0 comments on commit a7fd81a

Please sign in to comment.