diff --git a/CHANGELOG.md b/CHANGELOG.md index 33c3cab..15ff6ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [Unreleased] ### Fixed +- `Diff` now sorts matches by `Similarity` descending before selecting the overall + suggestion. Previously, `KeywordSearch` ordered candidates by token overlap score, + so a high-keyword-score candidate classified as ADD could mask a lower-keyword-score + candidate with higher Jaccard similarity that should have been UPDATE or DUPLICATE. - Deduplication false positives on scientific and domain-specific text: - Removed bare `"not"` from negation words — it appears in virtually all scientific prose and caused unrelated records to be classified as CONFLICT. diff --git a/internal/search/diff.go b/internal/search/diff.go index 9d33c4a..c74473c 100644 --- a/internal/search/diff.go +++ b/internal/search/diff.go @@ -159,6 +159,14 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } } + // Sort by similarity descending so matches[0] is always the strongest candidate. + // KeywordSearch orders by token overlap score, which can differ from the final + // Jaccard-based Similarity — a high-keyword-score ADD would otherwise mask a + // lower-keyword-score UPDATE or DUPLICATE from a more similar candidate. 
+	sort.SliceStable(matches, func(i, j int) bool {
+		return matches[i].Similarity > matches[j].Similarity // stable: ties keep their incoming order
+	})
+
 	// Overall suggestion: take the strongest match
 	overall := DiffAdd
 	if len(matches) > 0 {
diff --git a/internal/search/diff_test.go b/internal/search/diff_test.go
index b63dd72..d10ba34 100644
--- a/internal/search/diff_test.go
+++ b/internal/search/diff_test.go
@@ -162,3 +162,29 @@ func TestDiff_LimitDefault(t *testing.T) {
 		t.Errorf("default limit 5: got %d matches", len(result.Matches))
 	}
 }
+
+func TestDiff_LowerKeywordScoreUpdateNotMasked(t *testing.T) {
+	// insightA: all of the new content's tokens are present (keyword score = 5/5 = 1.0),
+	// but Jaccard = 5/14 ≈ 0.36 → ADD. KeywordSearch puts this first.
+	insightA := &model.Insight{
+		ID:      "a",
+		Content: "project uses redis for caching database monitoring alerting logging tracing scaling replication failover clustering sharding",
+	}
+	// insightB: keyword score = 4/5 = 0.8 (ranks second), Jaccard = 4/6 ≈ 0.67 → UPDATE.
+	// Without sorting by Similarity, insightA's ADD masks this UPDATE.
+	insightB := &model.Insight{
+		ID:      "b",
+		Content: "project uses redis postgresql caching",
+	}
+
+	result := Diff(
+		[]*model.Insight{insightA, insightB},
+		"project uses redis for caching database",
+		DiffOptions{},
+	)
+
+	if result.Suggestion != DiffUpdate {
+		t.Errorf("want UPDATE (insightB is more similar by Jaccard), got %s — "+
+			"high-keyword-score ADD from insightA masked the UPDATE", result.Suggestion)
+	}
+}