Skip to content

Commit

Permalink
fixed assymetry in jw-distance. closes #42
Browse files Browse the repository at this point in the history
  • Loading branch information
markvanderloo committed Oct 26, 2015
2 parents 6242dc7 + 837755e commit 27cd1a5
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 64 deletions.
2 changes: 1 addition & 1 deletion build/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Description: Implements an approximate string matching version of R's native
An implementation of soundex is provided as well. Distances can be computed
between character vectors while taking proper care of encoding or between
integer vectors representing generic sequences.
Version: 0.9.3
Version: 0.9.4
Depends: R (>= 2.15.3)
Imports: parallel
URL: https://github.com/markvanderloo/stringdist
Expand Down
3 changes: 2 additions & 1 deletion pkg/NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
version 0.9.4
- fixed edge case for zero-size for lower tridiagonal dist matrices (caused UBSAN to fire, but gave correct results).
- bugfix: edge case for zero-size for lower tridiagonal dist matrices (caused UBSAN to fire, but gave correct results).
- bugfix in jw distance: not symmetric for certain cases (thanks to github user gtumuluri)

version 0.9.3
- new function for tokenizing integer sequences: seq_qgrams
Expand Down
116 changes: 55 additions & 61 deletions pkg/src/jaro.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,6 @@
#endif



/* First match of a in b[]
* Returns -1 if no match is found
* Parameter 'guard; indicates which elements of b have been matched before to avoid
* matching two instances of the same character to the same position in b (which we treat read-only).
*/
static int match_int(unsigned int a, unsigned int *b, double *guard, int width, int m){
int i = 0;
while (
( i < width ) &&
( b[i] != a || (b[i] == a && guard[i] > 0))
){
++i;
}
// ugly edge case workaround
if ( !(m && i==width) && b[i] == a ){
guard[i] = 1.0;
return i;
}
return -1;
}

// Winkler's l-factor (nr of matching characters at beginning of the string).
static double get_l(unsigned int *a, unsigned int *b, int n){
int i=0;
Expand All @@ -65,59 +43,72 @@ static double get_l(unsigned int *a, unsigned int *b, int n){
* x : length of a (in uints)
* y : length of b (in uints)
* p : Winkler's p-factor in (0,0.25)
* work : workspace, minimally of length max(x,y)
* work : workspace, minimally of length x + y
*
*/
double jaro_winkler_dist(
unsigned int *a,
int x,
unsigned int *b,
int y,
double p,
double *w,
double *work
){
unsigned int *a
, int x
, unsigned int *b
, int y
, double p
, double *w
, double *work
){


// edge case
if ( x == 0 && y == 0 ) return 0;
// swap arguments if necessary, so we always loop over the shortest string
if ( x > y ){
unsigned int *c = b;
unsigned int z = y;
b = a;
a = c;
y = x;
x = z;
}

for (int k=0; k<MAX(x,y); k++) work[k] = 0.0;
for (int k=0; k < x + y; k++) work[k] = 0;

// we need space for integers (or do a lot of conversions)
unsigned int *wrk = (unsigned int*) work;
unsigned int *matcha = wrk
, *matchb = wrk + x;
unsigned int left, right;

// number of matches
int m = 0;
// max transposition distance
int M = MAX(MAX(x,y)/2 - 1,0);
// transposition counter
double t = 0.0;
// number of matches
double m = 0.0;
int max_reached;
int left, right, J, jmax=0;

for ( int i=0; i < x; ++i ){
left = MAX(0, i-M);
if ( left >= y ){
J = -1;
} else {
right = MIN(y, i+M);
// ugly workaround: I should rewrite match_int.
max_reached = (right == y) ? 1 : 0;
J = match_int(a[i], b + left, work + left, right - left, max_reached);

// store the match indices. Indices are stored as i+1 because 0 is used as 'no match'
for ( int i = 0; i < x; ++i){
left = MAX(0,i-M);
right = MIN(y,i+M);
for ( int j = left; j <= right; j++){
if ((a[i] == b[j]) && (matchb[j]==0)){
matcha[i] = i+1;
matchb[j] = j+1;
m += 1;
break;
}
}
}

if ( J >= 0 ){
++m;
t += (J + left < jmax ) ? 1 : 0;
jmax = MAX(jmax, J + left);
// copy matches so they're easy to compare for transposition counting
int j = 0;
for (int i=0; i < x; ++i){
if (matcha[i]){
matcha[j] = a[matcha[i]-1];
++j;
}
}
j = 0;
for (int i=0; i < y; ++i){
if (matchb[i]){
matchb[j] = b[matchb[i]-1];
++j;
}
}

// count 'transpositions', the Jaro way.
double t = 0.0;
for ( int k=0; k<m; ++k){
if (matcha[k] != matchb[k]) t += 0.5;
}

double d;
if ( m < 1 ){
d = 1.0;
Expand All @@ -130,8 +121,11 @@ double jaro_winkler_dist(
int n = MIN(MIN(x,y),4);
d = d - get_l(a,b,n)*p*d;
}

return d;
}





4 changes: 3 additions & 1 deletion pkg/src/stringdist.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){
S->tree = new_qtree(S->q, 2L);
break;
case jw :
S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b));
// S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b));
S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b));

S->weight = (double *) malloc(3L*sizeof(double));
memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
S->p = va_arg(args, double);
Expand Down
7 changes: 7 additions & 0 deletions pkg/tests/testthat/testStringdist.R
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,13 @@ test_that("basic examples and edge cases work",{
# following test added after a bug report of Carol Gan:
expect_equal(stringdist("tire","tree",method="jw"),stringdist("tree","tire",method="jw"));
expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="jw"))),1)
# following test added after issue #42 by github user gtumuluri
# thanks to Jan for providing this simple example.
expect_equal(stringdist("DHCXXXXX","HCDXXXXX",method="jw"),stringdist("HCDXXXXX","DHCXXXXX",method="jw"))
# the following test added after issue #42, comment by desource90
expect_equal(
stringdist("RICK WARREN","WARREN BUFFET",method="jw")
, 1 - (1/3)*(7/13 + 7/11 + (7-3.5)/7))
})

test_that("Extended examples work",{
Expand Down

0 comments on commit 27cd1a5

Please sign in to comment.