diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index d338ca0..7c4026b 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -21,7 +21,7 @@ Imports: parallel URL: https://github.com/markvanderloo/stringdist BugReports: https://github.com/markvanderloo/stringdist/issues -Date: 2015-12-30 +Date: 2016-09-09 Suggests: testthat RoxygenNote: 5.0.1 diff --git a/pkg/NEWS b/pkg/NEWS index fe2499a..ba6ba5b 100644 --- a/pkg/NEWS +++ b/pkg/NEWS @@ -3,6 +3,8 @@ version 0.9.4.2 (thanks to Max Fritsche) - bugfix in stringdistmatrix(a): Would segfault on q-gram w/input > ~7k strings and q>1 (thanks to Connor McKay) +- bugfix in jaccard distance: distance not always correct when passing multiple + strings (thanks to Robert Carlson) version 0.9.4.1 - stringdistmatrix(a) now outputs long vectors (issue #45, thanks to Wouter diff --git a/pkg/src/Rstringdist.c b/pkg/src/Rstringdist.c index 0a1bcad..0a91fc7 100644 --- a/pkg/src/Rstringdist.c +++ b/pkg/src/Rstringdist.c @@ -205,9 +205,13 @@ SEXP R_amatch(SEXP x, SEXP table, SEXP method len_T = T->str_len[j]; if (len_X != NA_INTEGER && len_T != NA_INTEGER ){ // both are char (usual case) d = stringdist(sd, str, len_X, *tab, len_T); +//Rprintf("d = %8.4f ",d); if ( d <= maxDist && d < d1){ index = j + 1; - if ( ABS(d) < 1e-14 ) break; // exact match + if ( fabs(d) < 1e-14 ){ + // Rprintf(" helleu!\n"); + break; // exact match + } d1 = d; } } else if ( len_X == NA_INTEGER && len_T == NA_INTEGER ) { // both are NA diff --git a/pkg/src/qgram.c b/pkg/src/qgram.c index 2b4c01d..12b71d5 100644 --- a/pkg/src/qgram.c +++ b/pkg/src/qgram.c @@ -305,7 +305,9 @@ static void getjaccard(qtree *Q, double *d){ ++d[0]; } // denominator: |x V y| - ++d[1]; + if ( Q->n[0] > 0 || Q->n[1] > 0){ + ++d[1]; + } // clean up and continue Q->n[0] = 0; Q->n[1] = 0; @@ -313,7 +315,18 @@ static void getjaccard(qtree *Q, double *d){ getjaccard(Q->right,d); } - +/* for testing purposes only +static void print_qtree(qtree *Q, int q){ + if (Q==NULL) return; + Rprintf("q=%d ",q); + Rprintf("qgram = {"); + for(int i = 0; i < q; i++) + Rprintf("%03d ",Q->qgram[i]); + Rprintf("}"); + Rprintf("n = [%2.0f %2.0f]\n", Q->n[0], Q->n[1]); + print_qtree(Q->left,q); + print_qtree(Q->right,q); +}*/ /*Get qgram distances * Input @@ -356,7 +369,8 @@ double qgram_dist( *Qp = push_string(t, y, q, *Qp, 1, 2); if (*Qp == NULL) return -2.0; - qtree *Q = *Qp; + + qtree *Q = *Qp; switch ( distance ){ case 0: getdist(Q,dist); @@ -373,7 +387,7 @@ double qgram_dist( } break; case 2: - getjaccard(Q,dist); + getjaccard(*Qp,dist); dist[0] = 1.0 - dist[0]/dist[1]; break; default: diff --git a/pkg/src/stringdist.c b/pkg/src/stringdist.c index fc8a579..6c745c9 100644 --- a/pkg/src/stringdist.c +++ b/pkg/src/stringdist.c @@ -124,12 +124,11 @@ double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *s case lcs : return lcs_dist(str_a, len_a, str_b, len_b, S->work); case qgram : - return qgram_dist(str_a, len_a, str_b, len_b, S->q, &(S->tree), 0L); + return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 0L); case cosine : return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 1L); case jaccard : - d = qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 2L); - break; + return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 2L); case jw : return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->weight, S->work); case soundex :