Skip to content

Commit

Permalink
fix for issue #42
Browse files Browse the repository at this point in the history
  • Loading branch information
markvanderloo committed Oct 19, 2015
1 parent c3701fd commit 1336f3d
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 62 deletions.
3 changes: 3 additions & 0 deletions pkg/NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
version 0.9.4
- bugfix in jw distance: not symmetric for certain cases (thanks to github user gtumuluri)

version 0.9.3
- new function for tokenizing integer sequences: seq_qgrams
- new function for matching integer sequences: seq_amatch
Expand Down
133 changes: 72 additions & 61 deletions pkg/src/jaro.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,6 @@
#endif



/* First match of a in b[]
* Returns -1 if no match is found
* Parameter 'guard; indicates which elements of b have been matched before to avoid
* matching two instances of the same character to the same position in b (which we treat read-only).
*/
static int match_int(unsigned int a, unsigned int *b, double *guard, int width, int m){
int i = 0;
while (
( i < width ) &&
( b[i] != a || (b[i] == a && guard[i] > 0))
){
++i;
}
// ugly edge case workaround
if ( !(m && i==width) && b[i] == a ){
guard[i] = 1.0;
return i;
}
return -1;
}

// Winkler's l-factor (nr of matching characters at beginning of the string).
static double get_l(unsigned int *a, unsigned int *b, int n){
int i=0;
Expand All @@ -65,59 +43,68 @@ static double get_l(unsigned int *a, unsigned int *b, int n){
* x : length of a (in uints)
* y : length of b (in uints)
* p : Winkler's p-factor in (0,0.25)
* work : workspace, minimally of length max(x,y)
* work : workspace, minimally of length x + y
*
*/
double jaro_winkler_dist(
unsigned int *a,
int x,
unsigned int *b,
int y,
double p,
double *w,
double *work
){
unsigned int *a
, int x
, unsigned int *b
, int y
, double p
, double *w
, double *work
){


// edge case
if ( x == 0 && y == 0 ) return 0;
// swap arguments if necessary, so we always loop over the shortest string
if ( x > y ){
unsigned int *c = b;
unsigned int z = y;
b = a;
a = c;
y = x;
x = z;
}

for (int k=0; k<MAX(x,y); k++) work[k] = 0.0;
//unsigned int *work = (unsigned int *) malloc((x + y)*sizeof(unsigned int));
for (int k=0; k < x + y; k++) work[k] = 0;
//
double *matcha = work
, *matchb = work + x;
unsigned int left, right;

// number of matches
int m = 0;
// max transposition distance
int M = MAX(MAX(x,y)/2 - 1,0);
// transposition counter
double t = 0.0;
// number of matches
double m = 0.0;
int max_reached;
int left, right, J, jmax=0;

for ( int i=0; i < x; ++i ){
left = MAX(0, i-M);
if ( left >= y ){
J = -1;
} else {
right = MIN(y, i+M);
// ugly workaround: I should rewrite match_int.
max_reached = (right == y) ? 1 : 0;
J = match_int(a[i], b + left, work + left, right - left, max_reached);


for ( int i = 0; i < x; ++i){
left = MAX(0,i-M);
right = MIN(y,i+M);
for ( int j = left; j <= right; j++){
if (a[i] == b[j] & matchb[j]==0){
matcha[i] = i+1;
matchb[j] = j+1;
m += 1;
break;
}
}
}

if ( J >= 0 ){
++m;
t += (J + left < jmax ) ? 1 : 0;
jmax = MAX(jmax, J + left);
double t = 0.0;
int j = 0;
for (int i=0; i < x; ++i){
if (matcha[i]){
matcha[j] = (double) a[(int) (matcha[i]-1)];
++j;
}
}
j = 0;
for (int i=0; i < y; ++i){
if (matchb[i]){
matchb[j] = (double) b[(int) (matchb[i]-1)];
++j;
}
}
for ( int k=0; k<m; ++k){
t += (matcha[k] == matchb[k]) ? 0 : 0.5;
}

double d;
if ( m < 1 ){
d = 1.0;
Expand All @@ -130,8 +117,32 @@ double jaro_winkler_dist(
int n = MIN(MIN(x,y),4);
d = d - get_l(a,b,n)*p*d;
}

return d;
}

/*
SEXP jwdist(SEXP a, SEXP x, SEXP b, SEXP y, SEXP p, SEXP w){
double *work = (double *) malloc((INTEGER(x)[0]+INTEGER(y)[0])*sizeof(double *));
SEXP out;
out = PROTECT(allocVector(REALSXP, 1));
REAL(out)[0] = jaro_winkler_dist(
(unsigned int *) INTEGER(a)
,INTEGER(x)[0]
,(unsigned int *) INTEGER(b)
,INTEGER(y)[0]
,REAL(p)[0]
,REAL(w)
, work
);
free(work);
UNPROTECT(1);
return out ;
}
*/




4 changes: 3 additions & 1 deletion pkg/src/stringdist.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){
S->tree = new_qtree(S->q, 2L);
break;
case jw :
S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b));
// S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b));
S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b));

S->weight = (double *) malloc(3L*sizeof(double));
memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
S->p = va_arg(args, double);
Expand Down
4 changes: 4 additions & 0 deletions pkg/tests/testthat/testStringdist.R
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,10 @@ test_that("basic examples and edge cases work",{
# following test added after a bug report of Carol Gan:
expect_equal(stringdist("tire","tree",method="jw"),stringdist("tree","tire",method="jw"));
expect_equal(sum(is.na(stringdist(c("a", NA, "b", "c"), c("aa", "bb", "cc", "dd"),method="jw"))),1)
# following test added after issue #42 by github user gtumuluri
# thanks to Jan for providing this simple example.
expect_equal(stringdist("DHCXXXXX","HCDXXXXX",method="jw"),stringdist("HCDXXXXX","DHCXXXXX",method="jw"))

})

test_that("Extended examples work",{
Expand Down

0 comments on commit 1336f3d

Please sign in to comment.