From 93d0d4c34be05a93c927c85a652027b66e7e1caa Mon Sep 17 00:00:00 2001 From: mark Date: Mon, 26 Oct 2015 17:21:41 +0100 Subject: [PATCH] solved some weight bugs --- pkg/src/dl.c | 8 ++++---- pkg/src/lv.c | 9 ++++++--- pkg/src/osa.c | 4 ++-- pkg/tests/testthat/testStringdist.R | 8 ++++++++ 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pkg/src/dl.c b/pkg/src/dl.c index 531ab8c..17005a9 100644 --- a/pkg/src/dl.c +++ b/pkg/src/dl.c @@ -119,12 +119,12 @@ double dl_dist( unsigned int swapCount, targetCharCount,i,j; double delScore, insScore, subScore, swapScore; - unsigned int score_ceil = x + y; + double score_ceil = x + y; /* intialize matrix start values */ scores[0] = score_ceil; - scores[1 * (y + 2) + 0] = score_ceil; - scores[0 * (y + 2) + 1] = score_ceil; + scores[1 * (y + 2) + 0] = weight[0]; //score_ceil; + scores[0 * (y + 2) + 1] = weight[1]; //score_ceil; scores[1 * (y + 2) + 1] = 0; uniquePush(dict,src[0]); @@ -142,7 +142,7 @@ double dl_dist( for(j=1;j<=y;j++){ if(i == 1) { uniquePush(dict,tgt[j]); - scores[1 * (y + 2) + (j + 1)] = j * weight[0]; + scores[1 * (y + 2) + (j + 1)] = j * weight[1]; scores[0 * (y + 2) + (j + 1)] = score_ceil; } targetCharCount = dict->value[which(dict, tgt[j-1])]; diff --git a/pkg/src/lv.c b/pkg/src/lv.c index f013e44..5edc43f 100644 --- a/pkg/src/lv.c +++ b/pkg/src/lv.c @@ -22,6 +22,7 @@ #include #endif + /* Levenshtein distance * Computes Levenshtein distance * - Simplified from restricted DL pseudocode at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance @@ -44,24 +45,26 @@ double lv_dist( double sub; for ( i = 0; i < I; ++i ){ - scores[i] = i * weight[1]; + scores[i] = i * weight[0]; } for ( j = 1; j < J; ++j, L += I ){ - scores[L] = j * weight[0]; + scores[L] = j * weight[1]; } + int M; for ( i = 1; i <= na; ++i ){ L = I; M= 0; for ( j = 1; j <= nb; ++j, L += I, M += I ){ sub = (a[i-1] == b[j-1]) ? 
0 : weight[2]; - scores[i + L] = MIN(MIN( + scores[i + I*j] = MIN(MIN( scores[i-1 + L] + weight[0], // deletion scores[i + M] + weight[1]), // insertion scores[i-1 + M] + sub // substitution ); } } + double score = scores[I*J-1]; return score; } diff --git a/pkg/src/osa.c b/pkg/src/osa.c index 8f67d7d..5eaf66c 100644 --- a/pkg/src/osa.c +++ b/pkg/src/osa.c @@ -41,10 +41,10 @@ double osa_dist(unsigned int *a, int na, unsigned int *b, int nb, double *weight double sub, tran; for ( i = 0; i < I; ++i ){ - scores[i] = i * weight[1]; + scores[i] = i * weight[0]; } for ( j = 1; j < J; ++j, L += I ){ - scores[L] = j * weight[0]; + scores[L] = j * weight[1]; } for ( i = 1; i <= na; ++i ){ diff --git a/pkg/tests/testthat/testStringdist.R b/pkg/tests/testthat/testStringdist.R index 5a3b333..9ade8d7 100644 --- a/pkg/tests/testthat/testStringdist.R +++ b/pkg/tests/testthat/testStringdist.R @@ -90,6 +90,14 @@ test_that("weights are handled correctly",{ expect_equal( stringdist("ABC", "BC", method = "dl", weight = c(i=.1, d=.1, s=1,t=.1)),.1 ) + # examples from the paper; Thanks to Nathalia Potocka for reporting. + expect_equal(stringdist("leia","leela",method="lv",weight=c(i=.1,d=1,s=1)),1.1) + expect_equal(stringdist("leia","leela",method="lv",weight=c(i=1,d=.1,s=1)),2) + expect_equal(stringdist("a","b",method="lv",weight=c(i=.1,d=1,s=.3)),.3) + expect_equal(stringdist("a","b",method="osa",weight=c(i=.1,d=1,s=.3,1)),.3) + expect_equal(stringdist("a","b",method="dl",weight=c(i=.1,d=1,s=.3,t=1)),.3) + expect_equal(stringdist("leia","leela",method="dl",weight=c(i=1,d=.1,s=1,t=1)),2) + }) test_that("NA's are handled correctly",{