Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
## [Unreleased]

### Fixed
- `Diff` now sorts matches by `Similarity` descending before selecting the overall
suggestion. Previously, `KeywordSearch` ordered candidates by token overlap score,
so a high-keyword-score candidate classified as ADD could mask a lower-keyword-score
candidate with higher Jaccard similarity that should have been UPDATE or DUPLICATE.
- Deduplication false positives on scientific and domain-specific text:
  - Removed bare `"not"` from negation words — it appears in virtually all
    scientific prose and caused unrelated records to be classified as CONFLICT.
Expand Down
8 changes: 8 additions & 0 deletions internal/search/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
}
}

// Sort by similarity descending so matches[0] is always the strongest candidate.
// KeywordSearch orders by token overlap score, which can differ from the final
// Jaccard-based Similarity — a high-keyword-score ADD would otherwise mask a
// lower-keyword-score UPDATE or DUPLICATE from a more similar candidate.
sort.Slice(matches, func(i, j int) bool {
return matches[i].Similarity > matches[j].Similarity
})

// Overall suggestion: take the strongest match
overall := DiffAdd
if len(matches) > 0 {
Expand Down
26 changes: 26 additions & 0 deletions internal/search/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,29 @@ func TestDiff_LimitDefault(t *testing.T) {
t.Errorf("default limit 5: got %d matches", len(result.Matches))
}
}

// TestDiff_LowerKeywordScoreUpdateNotMasked checks that the overall suggestion
// returned by Diff is driven by the candidate with the highest Jaccard
// similarity rather than by the candidate that ranks first on keyword score.
func TestDiff_LowerKeywordScoreUpdateNotMasked(t *testing.T) {
	const incoming = "project uses redis for caching database"

	candidates := []*model.Insight{
		// insightA ("a"): contains every token of the incoming text, so its
		// keyword score is 5/5 = 1.0 and KeywordSearch ranks it first. The many
		// extra tokens dilute Jaccard to 5/14 ≈ 0.36, which classifies as ADD.
		{
			ID:      "a",
			Content: "project uses redis for caching database monitoring alerting logging tracing scaling replication failover clustering sharding",
		},
		// insightB ("b"): keyword score is only 4/5 = 0.8, so it ranks second,
		// but Jaccard is 4/6 ≈ 0.67, which classifies as UPDATE. If matches are
		// not ordered by Similarity, insightA's ADD hides this UPDATE.
		{
			ID:      "b",
			Content: "project uses redis postgresql caching",
		},
	}

	got := Diff(candidates, incoming, DiffOptions{}).Suggestion
	if got == DiffUpdate {
		return
	}
	t.Errorf("want UPDATE (insightB is more similar by Jaccard), got %s — high-keyword-score ADD from insightA masked the UPDATE", got)
}
Loading