diff --git a/README.md b/README.md
index b894d05e..a6305629 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ Use Codegraph when you need fast structural answers about a repo without relying
 - Cross-file go-to-definition and find-references support across the shared source-language pipeline.
 - Deterministic agent search, bounded explanation packets, portable artifact bundles, and MCP tools across files, symbols, chunks, SQL objects, and graph neighborhoods with stable follow-up handles.
 - Semantic chunking for code and text files, including Vue and Svelte single-file component block splitting.
+- Duplicate and near-duplicate detection over indexed symbols, semantic chunks, and text chunks.
 - AST grep, public API summaries, unresolved import reports, hotspot analysis, cycle detection, and shortest dependency paths.
 - PR impact analysis and review bundles that map diffs to changed symbols, impacted code, likely tests, and graph deltas.
 - SQL language support for `.sql` files, including statement chunks, object symbols, SQL-to-SQL graph edges, SQL navigation, and statement facts.
@@ -111,6 +112,9 @@ node ./dist/cli.js graph --root . ./src --compact-json --output codegraph.json
 
 # inspect public API surface
 node ./dist/cli.js apisurface
+
+# find duplicate and near-duplicate code
+node ./dist/cli.js duplicates ./src --min-confidence medium --limit 20
 ```
 
 If you install the published CLI instead of using a source checkout, replace `node ./dist/cli.js` with `codegraph`.
@@ -190,6 +194,7 @@ The supported package import surface is the root export, `@lzehrung/codegraph`.
 ## Common workflows
 
 - Repo triage: run `codegraph inspect ./src --limit 20`, then follow with `codegraph hotspots ./src --limit 20` or `codegraph unresolved` to focus the next pass.
+- Duplicate cleanup: run `codegraph duplicates ./src --min-confidence medium` before refactors to find code worth extracting into shared helpers.
 - Symbol navigation: use `codegraph goto <file> <line> <column>` and `codegraph refs --file <file> --line <line> --col <column> --pretty` when a question is about definitions or semantic usages rather than matching strings.
 - PR review: run `codegraph impact --base origin/main --head HEAD --pretty` for a ranked map, `codegraph review --base origin/main --head HEAD --summary` for a compact reviewer handoff with actionable candidate tests, or redirect plain `review` output when a downstream tool needs the full JSON bundle.
 - Worktree review: run `codegraph impact --base HEAD --head WORKTREE --pretty` for current staged and unstaged tracked-file changes, then `codegraph review --base HEAD --head WORKTREE --summary` for a compact handoff. Use `--head STAGED` to compare `HEAD` against the current index.
diff --git a/codegraph-skill/codegraph/SKILL.md b/codegraph-skill/codegraph/SKILL.md
index 2cb7047c..50afa023 100644
--- a/codegraph-skill/codegraph/SKILL.md
+++ b/codegraph-skill/codegraph/SKILL.md
@@ -48,6 +48,7 @@ Then choose the narrowest follow-up command:
 - Review handoff: `codegraph review --base HEAD --head WORKTREE --summary`
 - Full review JSON: `codegraph review --base origin/main --head HEAD`
 - Public API: `codegraph apisurface`
+- Duplicate cleanup: `codegraph duplicates --root . ./src --min-confidence medium`
 - Chunks: `codegraph chunk <file>`
 - Artifact bundle: `codegraph artifact build --root . --out codegraph-out --json`
 - MCP server: `codegraph mcp serve --root . --stdio` or `codegraph mcp serve --root . --port 7331`
@@ -210,6 +211,12 @@ For git-provider impact and git-scoped review/index/graph commands, `WORKTREE` c
   Reports source dependency cycles; document-only link loops remain graph edges but are filtered from cycle warnings.
 - Public API surface:
   `codegraph apisurface`
+- Duplicate and near-duplicate code:
+  `codegraph duplicates --root . ./src --min-confidence medium`
+  Covers indexed symbols, semantic chunks, and text chunks.
+  A single positional directory becomes the project root unless `--root` is set.
+  Use `--include-small` for tiny helpers.
+  Use `--include-same-file` for local clone cleanup.
 - Unresolved project imports:
   `codegraph unresolved`
   Excludes graph-only document/template link edges plus known runtime/package externals: supported-language standard libraries, URL imports, and dependencies declared in nearby manifests such as `package.json`, Python, PHP, Rust, Go, Zig, Ruby, Java/Kotlin, .NET, C/C++, and Swift package manifests.
diff --git a/docs/cli.md b/docs/cli.md
index d350815e..adcc9fa2 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -147,6 +147,11 @@ codegraph chunk package.json --text --max-tokens 200
 # Override language detection and token limits
 codegraph chunk config.yaml --language yaml --min-tokens 100 --max-tokens 300
 
+# Detect duplicate and near-duplicate code units
+codegraph duplicates ./src --min-confidence medium --limit 20
+codegraph duplicates --root . ./src ./packages/app --include-same-file
+codegraph duplicates --help
+
 # Go to definition
 codegraph goto <file> <line> <column>
 
@@ -161,6 +166,15 @@ codegraph grep --query '(function_declaration name: (identifier) @name)'
 codegraph grep --pattern 'eval\(' --ignore-case
 ```
 
+`duplicates` always reports scored exact, renamed, near, and weak clone candidates as JSON.
+
+- It combines indexed symbols, semantic chunks, and text chunks.
+- It reports project-relative paths, confidence, clone type, metrics, omission counts, and pair stats.
+- A single positional directory becomes the project root unless `--root` is set.
+- Use `--root . ./src` for scoped scans with repository-relative paths.
+- Use `--include-small` for tiny helpers.
+- Use `--include-same-file` for non-overlapping clones inside one file.
+
 `search`, `explain`, `artifact`, and `mcp` each support command-specific `--help` output so agents do not have to infer their options from the top-level help. `search` is deterministic and vectorless. It returns ranked results with project-relative stable handles, rank reasons, evidence, graph neighbors, follow-up commands, result counts, per-packet limits, and omission counts. `explain` resolves file paths, symbol names, SQL object names, and search handles, including file/chunk/graph handles, into bounded packets with symbols, dependencies, reverse dependencies, references, snippets, SQL object relation facts, changed-context review tasks/candidate tests, explicit limits, omission counts, and follow-ups. Generated follow-up and suggested-question commands POSIX-shell-quote dynamic arguments when needed. SQL object names resolve by exact name first; unqualified basenames resolve only when unique, so handles or schema-qualified names are preferred. Reference and snippet omission counts are lower bounds after the bounded navigation scan reaches its cap. `artifact build` writes `codegraph.sqlite`, self-describing project-relative `graph.json`, `CODEGRAPH_REPORT.md`, `questions.json`, and `manifest.json` by default; suggested questions use unique IDs backed by stable handles when a handle is available. Use artifact flags to select a subset. `--force` permits non-empty output directories, removes recognizable stale Codegraph artifacts, preserves unrelated operator files, and refuses unrecognized reserved-name collisions. Artifact contents exclude their own output directory and linked outside-root files. `mcp serve` exposes `search`, `get_file`, `get_symbol`, `goto`, `refs`, `deps`, `rdeps`, `path`, `impact`, `review`, `query_sqlite`, and `artifact_build` over stdio by default or Streamable HTTP with `--port <number>`. HTTP serves `/mcp`, binds to `127.0.0.1` unless `--host <host>` is passed, validates the Host header, allows loopback Host headers for wildcard binds, and rejects oversized request bodies. MCP file and artifact paths are confined to `--root` after realpath resolution; tools are read-only by default, `query_sqlite` is row- and byte-bounded and rejects synthetic payload functions, and `--allow-build` enables artifact output only. `chunk` uses semantic Tree-sitter chunking for registered source and stylesheet languages, Vue and Svelte block-aware chunking for single-file components, and text chunking for JSON, YAML, and unsupported extensions. Use `--text` to force text chunking.
 
 ### Dependency analysis and diagnostics
diff --git a/docs/library-api.md b/docs/library-api.md
index 9bfdc226..c0732f43 100644
--- a/docs/library-api.md
+++ b/docs/library-api.md
@@ -213,6 +213,38 @@ See the test suites for concrete examples:
 
 The integration examples demonstrate semantic chunking with type-based filtering, text-file chunking for configuration processing, intelligent splitting of large blocks, and metadata useful for embeddings or retrieval pipelines.
 
+## Duplicate detection
+
+`findDuplicates()` scans a built `ProjectIndex` for exact, renamed, near, and weak clone candidates.
+
+- It uses indexed symbols, semantic chunks, and text chunks.
+- Results include confidence, score, clone type, metrics, omission counts, and pair stats.
+- Paths are project-relative when the index has a project root.
+
+```ts
+import { buildProjectIndex, findDuplicates } from "@lzehrung/codegraph";
+
+const root = process.cwd();
+const index = await buildProjectIndex(root);
+const duplicates = await findDuplicates(index, {
+  minConfidence: "medium",
+  limit: 20,
+});
+
+console.log(duplicates.suggestions);
+```
+
+Useful options:
+
+- `minConfidence`: `high`, `medium`, or `low`; default `medium`.
+- `includeSameFile`: report non-overlapping clones in the same file.
+- `includeSmall`: include units below the default token floor.
+- `minTokens` and `maxTokens`: tune unit and fallback chunk bounds.
+
+Tests:
+
+- `tests/duplicates.test.ts`
+
 ## Basic index building
 
 Build a full project index and use go-to-definition:
diff --git a/docs/superpowers/plans/2026-05-19-duplicate-detection.md b/docs/superpowers/plans/2026-05-19-duplicate-detection.md
new file mode 100644
index 00000000..93471df4
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-19-duplicate-detection.md
@@ -0,0 +1,359 @@
+# Duplicate Detection Design
+
+## Recommendation
+
+Use a structural-first duplicate detector.
+
+- V1 core: exact region hashes, normalized text hashes, token shingles, winnowed signatures, and symbol/chunk metadata.
+- V1 optional: normalized AST shape hashes when parse trees are already available.
+- Not V1 core: model embeddings, semantic-equivalence claims, or SQLite schema changes.
+- External workflow: keep embeddings outside Codegraph through `codegraph chunk` output.
+
+This matches Codegraph's deterministic search and review model:
+
+- [Agent search plan](./2026-05-14-agent-search-artifact-mcp.md) keeps search/explain vectorless and deterministic.
+- [Agent workflows](../../agent-workflows.md) documents bounded search/explain packets.
+- [Library API](../../library-api.md) documents chunking for LLM and vector workflows.
+- [How it works](../../how-it-works.md) documents Tree-sitter, `ProjectIndex`, and content-hash caching.
+
+## Problem
+
+Duplicate detection should surface likely refactor candidates, not prove semantic equivalence.
+
+In scope:
+
+- Find exact, renamed, and near-copy regions for human or agent review.
+- Rank candidates by transparent reasons.
+- Bound runtime on large repositories.
+- Reuse discovery, chunking, symbol, graph, and impact infrastructure.
+
+Out of scope:
+
+- Type-4 semantic equivalence proof.
+- Cross-language clone detection in V1.
+- Embedding storage or model execution in core.
+- Persistent SQLite schema changes for the first implementation.
+
+## Source Anchors
+
+- [`src/chunking/chunkFile.ts`](../../../src/chunking/chunkFile.ts): semantic chunk boundaries.
+- [`src/chunking/chunkSFC.ts`](../../../src/chunking/chunkSFC.ts): single-file component block chunking.
+- [`src/indexer/types.ts`](../../../src/indexer/types.ts): `ProjectIndex`, `ModuleIndex`, and `SymbolDef` metadata.
+- [`src/agent/search.ts`](../../../src/agent/search.ts): deterministic scores, reasons, limits, and stable sort behavior.
+- [`src/graphs/grep.ts`](../../../src/graphs/grep.ts): AST and text grep capabilities.
+- [`src/impact/parse.ts`](../../../src/impact/parse.ts): git rename/copy `similarityIndex` parsing.
+- [`src/impact/types.ts`](../../../src/impact/types.ts): impact file-change metadata.
+
+## Taxonomy
+
+| Clone type | Description | V1 support | Primary signals |
+| --- | --- | --- | --- |
+| Type-1 | Exact duplicated text | Yes | Raw or normalized text hash |
+| Type-2 | Same structure with renamed identifiers or literals | Yes | AST shape hash, token shingles |
+| Type-3 | Edited copy with inserted or deleted statements | Partial | Winnowing, Jaccard, ordered-token similarity |
+| Type-4 | Semantically equivalent but structurally different | External only | Embeddings or deeper semantic analysis |
+
+Type-4 results come only from external tooling such as embeddings; treat them as discovery hints, not verified clones.
+
+## Approach Catalog
+
+### 1. Exact Region Hash
+
+- Inputs: source text and unit range.
+- Hooks: `ProjectIndex` symbols, semantic chunks, content-hash cache behavior.
+- Cost: O(bytes).
+- Strength: strongest Type-1 signal.
+- Weakness: misses renames and small edits.
+
+### 2. Comment and Whitespace Normalized Hash
+
+- Inputs: source text, language comment rules where available.
+- Hooks: language definitions and parser-backed comments.
+- Cost: O(bytes).
+- Strength: catches formatting drift.
+- Weakness: string-literal-safe comment stripping needs care.
+
+### 3. Symbol Metadata Prefilter
+
+- Inputs: `SymbolDef.kind`, `range`, `lineSpan`, `complexity`, docstring, export status.
+- Hooks: `src/indexer/types.ts` and locals/export extraction.
+- Cost: O(symbols).
+- Strength: reduces candidate comparisons.
+- Weakness: metadata is never proof by itself.
+
+### 4. Chunk-Body Similarity
+
+- Inputs: `Chunk` records from code, text, and SFC chunkers.
+- Hooks: `chunkFile`, `chunkTextFile`, and `chunkSFCFile`.
+- Cost: O(chunks + candidate pairs).
+- Strength: works when symbol extraction is incomplete.
+- Weakness: broad chunks can create boilerplate matches.
+
+### 5. Token N-Gram Fingerprints
+
+- Inputs: normalized tokens from each unit.
+- Hooks: symbol ranges and chunk boundaries.
+- Cost: O(tokens), plus memory for unique shingles.
+- Strength: Type-2 and lightweight Type-3 candidate generation.
+- Weakness: common boilerplate buckets need caps.
+
+### 6. Winnowing or MinHash
+
+- Inputs: token shingles.
+- Hooks: internal duplicate fingerprint index.
+- Cost: O(tokens) indexing plus candidate verification.
+- Strength: scalable near-copy detection.
+- Weakness: requires shingle, window, and threshold tuning.
+
+### 7. Normalized AST Fingerprint
+
+- Inputs: Tree-sitter parse tree and unit range.
+- Hooks: native parser and JS fallback parser paths.
+- Cost: parser-dependent and deterministic.
+- Strength: strong Type-2 signal.
+- Weakness: parity requires explicit language rules.
+
+### 8. AST Structural Grep Recurrence
+
+- Inputs: Tree-sitter queries or text patterns.
+- Hooks: `src/graphs/grep.ts`.
+- Cost: O(files) per query.
+- Strength: targeted repeated anti-patterns.
+- Weakness: not a general clone detector.
+
+### 9. Git Similarity Metadata
+
+- Inputs: git or raw diff metadata.
+- Hooks: `similarityIndex` from impact parsing.
+- Cost: free when impact already parses git diff.
+- Strength: PR-scoped copy and rename context.
+- Weakness: not whole-repo detection.
+
+### 10. External Embeddings
+
+- Inputs: `codegraph chunk` text and metadata.
+- Hooks: chunk CLI and library APIs.
+- Cost: external runtime and storage.
+- Strength: Type-4 discovery and natural-language similarity.
+- Weakness: results are not deterministic and cannot serve as proof in the core engine.
+
+## Pipeline
+
+```mermaid
+flowchart LR
+  files[Discovered files] --> units[Symbols and chunks]
+  units --> exact[Exact and normalized hashes]
+  units --> tokens[Token shingles]
+  units --> ast[Optional AST shape hash]
+  exact --> buckets[Candidate buckets]
+  tokens --> buckets
+  ast --> buckets
+  buckets --> caps[Bucket and unit caps]
+  caps --> metrics[Pair metrics]
+  metrics --> score[Composite score]
+  score --> report[Bounded suggestions with reasons]
+```
+
+## Unit Collection
+
+Default strategy:
+
+- Prefer symbols for functions, methods, classes, interfaces, types, SQL routines, and SQL objects.
+- Fall back to semantic chunks when symbol coverage is incomplete.
+- Preserve both symbol and chunk identity where both exist.
+- Skip units below `minTokens`, default `40`, unless exact small matches are requested.
+- Skip or split units above `maxTokens`, default `800`.
+
+Suggested shape:
+
+```ts
+type DuplicateUnit = {
+  id: string;
+  file: string;
+  languageId: string;
+  kind: "symbol" | "chunk" | "sql" | "text";
+  name?: string;
+  symbolKind?: string;
+  startLine: number;
+  endLine: number;
+  tokenCount: number;
+  complexity?: number;
+};
+```
+
+## Fingerprints
+
+Generate cheap signals first:
+
+- `rawHash`: exact source region hash.
+- `normalizedTextHash`: comment and whitespace normalized hash.
+- `tokenShingles`: normalized token k-grams, default `k = 5`.
+- `winnowedSignature`: representative shingle hashes, default window size `4`.
+- `astShapeHash`: optional normalized AST structure hash.
+
+## Candidate Generation
+
+Avoid all-pairs comparison.
+
+- Bucket by normalized text hash, AST shape hash, and winnowed shingle hashes.
+- Cap buckets larger than `maxBucketSize`, default `200`.
+- Compare pairs with exact hash, AST hash, enough shared shingles, or PR git similarity.
+- Preserve pair identity as sorted `(leftUnitId, rightUnitId)`.
+
+## Pair Metrics
+
+For each candidate pair, compute bounded metrics:
+
+- `tokenJaccard`
+- `orderedTokenSimilarity`
+- `shingleOverlap`
+- `lengthRatio`
+- `sameSymbolKind`
+- `lineSpanRatio`
+- `astShapeEqual`
+- `sameFile`
+- `sharedDependencyContext`
+
+## Output Shape
+
+```ts
+type DuplicateSuggestion = {
+  score: number;
+  confidence: "high" | "medium" | "low";
+  cloneType: "exact" | "renamed" | "near" | "weak";
+  left: DuplicateUnitRef;
+  right: DuplicateUnitRef;
+  metrics: DuplicateMetrics;
+  reasons: string[];
+};
+```
+
+## Scoring
+
+Scores are deterministic and capped to `0..100`.
+
+Positive signals:
+
+| Signal | Weight | Reason |
+| --- | ---: | --- |
+| Raw source hash match | +60 | `raw_hash_match` |
+| Normalized text hash match | +50 | `normalized_text_hash_match` |
+| AST shape hash match | +40 | `ast_shape_match` |
+| Token Jaccard >= 0.95 | +30 | `token_jaccard_0.97` |
+| Token Jaccard >= 0.85 | +22 | `token_jaccard_0.88` |
+| Token Jaccard >= 0.70 | +12 | `token_jaccard_0.73` |
+| Shingle overlap | +0..25 | `shared_shingles_14` |
+| Ordered token similarity >= 0.80 | +10 | `ordered_similarity_0.84` |
+| Same symbol kind | +4 | `same_symbol_kind_function` |
+| Similar line span | +4 | `similar_line_span` |
+| Similar complexity | +3 | `similar_complexity` |
+| PR git similarity >= 80% | +20 | `git_similarity_92` |
+| Shared dependency context | +3 | `shared_dependency_context` |
+
+Negative signals:
+
+| Signal | Weight | Reason |
+| --- | ---: | --- |
+| Token count below threshold | -25 | `trivial_body_penalty` |
+| Length ratio outside `0.5..2.0` | -20 | `length_mismatch_penalty` |
+| Boilerplate bucket too large | -20 | `boilerplate_bucket_penalty` |
+| License/header-only region | -30 | `license_header_penalty` |
+| Generated or vendored path | -15 | `generated_path_penalty` |
+| Same file and adjacent regions | -10 | `same_file_adjacent_penalty` |
+
+Hard filters:
+
+- Ignore files excluded by discovery config or CLI ignore globs.
+- Discard overlapping ranges in the same file.
+- Discard same enclosing-symbol containment.
+- Discard below-threshold units unless exact small matching is enabled.
+- Discard oversized buckets without exact or AST evidence.
+
+Confidence tiers:
+
+- High: `score >= 80`, exact hash with `tokenJaccard >= 0.90`, or AST match with `tokenJaccard >= 0.85`.
+- Medium: `score >= 55` and `tokenJaccard >= 0.70`.
+- Low: `score >= 35`.
+
+Clone types:
+
+- `exact`: raw or normalized text hash match.
+- `renamed`: AST shape match or very high token similarity with renamed identifiers/literals.
+- `near`: strong shingle or token similarity with edits.
+- `weak`: low-threshold structural similarity.
+
+Sort order:
+
+1. Confidence rank.
+2. Score descending.
+3. Token Jaccard descending.
+4. Left file, left line, right file, right line.
+
+## Rollout
+
+### Phase 0: Design Doc
+
+- Add this plan.
+- Do not update public capability docs until implementation ships.
+
+### Phase 1: In-Memory Engine
+
+- Add unit extraction and fingerprinting helpers.
+- Implement exact hashes, normalized text hashes, shingles, and winnowing.
+- Add focused fixtures.
+- Avoid cache or SQLite persistence.
+
+### Phase 2: CLI and Library
+
+- Add `findDuplicates(index, options)`.
+- Add `codegraph duplicates`.
+- Return deterministic JSON with bounded results and omission counts.
+- Update CLI, library API, and skill docs.
+
+### Phase 3: AST Normalization
+
+- Add language-aware AST shape hashing.
+- Update language parity and scenario docs.
+- Add per-language tests.
+
+### Phase 4: PR and Agent Integration
+
+- Use git `similarityIndex` in review or impact mode.
+- Add duplicate candidates to explain packets only after output stabilizes.
+
+### Phase 5: Optional Persistence
+
+- Consider duplicate cache files only if repeated runs need them.
+- Add SQLite schema migrations only with a clear query use case.
+
+## Parity Rules
+
+- V1 compares same-language units only.
+- Source languages use symbol-first units where possible.
+- SQL can start with statement/object chunks.
+- Vue, Svelte, and Astro compare script/style/template blocks separately.
+- Graph-first formats and config files may be chunk-only.
+- Document language limitations when AST normalization arrives.
+
+## Testing
+
+Core cases:
+
+- Exact duplicate functions in two files.
+- Renamed variables and literals.
+- Near-copy with one edited branch.
+- Same-file non-overlapping duplicates.
+- Similar names with different bodies.
+- Tiny trivial helpers below threshold.
+- Generated or header boilerplate.
+- Deterministic result ordering.
+- Large repeated boilerplate buckets.
+- PR `similarityIndex` boosts changed-file suggestions.
+
+## Open Questions
+
+- Should `minTokens` default to `40`, or stay closer to the `150` chunking default?
+- Should same-file duplicates be enabled by default?
+- Should hotspot files increase score or decrease score?
+- Should PR mode compare changed regions against the whole repo or dependency neighborhoods?
+- Should embeddings remain external, or should a plugin interface accept external similarity scores?
diff --git a/src/cli.ts b/src/cli.ts
index 9212ef7e..205d8236 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -25,6 +25,7 @@ import {
 } from "./cli/context.js";
 import { handleArtifactCommand } from "./cli/artifact.js";
 import { buildDoctorReport } from "./cli/doctor.js";
+import { handleDuplicatesCommand } from "./cli/duplicates.js";
 import { handleExplainCommand } from "./cli/explain.js";
 import { handleGraphCommand } from "./cli/graph.js";
 import { handleGraphDeltaCommand } from "./cli/graphDelta.js";
@@ -43,11 +44,16 @@ import { handleSkillCommand } from "./cli/skill.js";
 import { handleSqlCommand } from "./cli/sql.js";
 import { hasDiscoveryOptions, loadCodegraphConfig, mergeDiscoveryOptions } from "./config.js";
 import { listChangedFiles } from "./util/git.js";
-import { listProjectFiles, type ProjectFileDiscoveryOptions } from "./util/projectFiles.js";
+import { DEFAULT_PROJECT_PATTERNS, listProjectFiles, type ProjectFileDiscoveryOptions } from "./util/projectFiles.js";
 import { normalizePath, resolveFilePathFromRoot, toProjectDisplayPath } from "./util/paths.js";
 
 export { isCliDiscoveryRelativePathInside } from "./cli/context.js";
 
+const DUPLICATE_PROJECT_PATTERNS = [
+  ...DEFAULT_PROJECT_PATTERNS,
+  "**/*.{json,jsonc,toml,txt,yaml,yml}",
+];
+
 function normalizeEntrypointPath(filePath: string): string {
   const resolvedPath = path.resolve(filePath);
   try {
@@ -151,6 +157,7 @@ async function runCliWithActiveRuntime(rawArgs: string[]) {
       cmd === "grep" ||
       cmd === "hotspots" ||
       cmd === "inspect" ||
+      cmd === "duplicates" ||
       cmd === "impact") &&
     !rootOpt &&
     parsed.positionals.length === 1 &&
@@ -236,7 +243,8 @@ async function runCliWithActiveRuntime(rawArgs: string[]) {
     ? { ...configDiscoveryOptions, globRoot: projectRootFs }
     : {};
 
-  const supportsIncludeRoots = cmd === "graph" || cmd === "index" || cmd === "hotspots" || cmd === "inspect";
+  const supportsIncludeRoots =
+    cmd === "graph" || cmd === "index" || cmd === "hotspots" || cmd === "inspect" || cmd === "duplicates";
   let includeRoots: string[] = [];
   if (supportsIncludeRoots) {
     if (rootOpt) {
@@ -256,11 +264,12 @@ async function runCliWithActiveRuntime(rawArgs: string[]) {
   };
 
   const resolveFilesFromRoots = async (): Promise<string[]> => {
-    if (!includeRootsAbs.length) return await listProjectFiles(projectRootFs, undefined, discoveryOptions);
+    const patterns = cmd === "duplicates" ? DUPLICATE_PROJECT_PATTERNS : undefined;
+    if (!includeRootsAbs.length) return await listProjectFiles(projectRootFs, patterns, discoveryOptions);
     const normalizedRoots = includeRootsAbs;
     const all: string[][] = await Promise.all(
       normalizedRoots.map(async (r) => {
-        const files = await listProjectFiles(r, undefined, {
+        const files = await listProjectFiles(r, patterns, {
           ...includeRootDiscoveryOptions,
           gitignoreRoot: projectRootFs,
         });
@@ -457,6 +466,27 @@ async function runCliWithActiveRuntime(rawArgs: string[]) {
     return;
   }
 
+  if (cmd === "duplicates") {
+    const files = await resolveFiles();
+    await handleDuplicatesCommand({
+      projectRootFs,
+      files,
+      getOpt,
+      hasFlag,
+      indexOptions: {
+        onProgress: progressHandler,
+        discovery: discoveryOptions,
+        ...(hasGraphOverrides ? { graph: buildGraphOptions() } : {}),
+        ...(nativeMode !== "auto" ? { native: nativeMode } : {}),
+        ...workerOpts,
+      },
+      writeJSONLine,
+      writeStderrLine,
+      exit: exitCli,
+    });
+    return;
+  }
+
   if (cmd === "dumpmod") {
     await handleDumpmodCommand({
       projectRootFs,
diff --git a/src/cli/duplicates.ts b/src/cli/duplicates.ts
new file mode 100644
index 00000000..88221012
--- /dev/null
+++ b/src/cli/duplicates.ts
@@ -0,0 +1,76 @@
+import { buildProjectIndexFromFiles } from "../indexer/build-index.js";
+import { findDuplicates, type DuplicateConfidence, type DuplicateDetectionOptions } from "../duplicates.js";
+import type { BuildOptions } from "../indexer/types.js";
+import { parseNonNegativeIntegerOption, parsePositiveIntegerOption } from "./options.js";
+
+/** Inputs the CLI entry point hands to the `duplicates` command handler. */
+export type DuplicatesCommandContext = {
+  projectRootFs: string;
+  files: string[];
+  getOpt: (name: string) => string | undefined;
+  hasFlag: (name: string) => boolean;
+  indexOptions?: BuildOptions;
+  writeJSONLine: (value: unknown) => void;
+  writeStderrLine: (message: string) => void;
+  exit: (code: number) => never;
+};
+
+/**
+ * Validates the raw `--min-confidence` value.
+ *
+ * @returns The confidence level, or `undefined` when the option was not passed.
+ * @throws Error when the value is not `high`, `medium`, or `low`.
+ */
+function parseConfidenceOption(rawValue: string | undefined): DuplicateConfidence | undefined {
+  if (rawValue === undefined) return undefined;
+  if (rawValue === "high" || rawValue === "medium" || rawValue === "low") return rawValue;
+  throw new Error(`Invalid --min-confidence value "${rawValue}". Expected high|medium|low.`);
+}
+
+/**
+ * Translates CLI options and flags into `DuplicateDetectionOptions`.
+ *
+ * `minConfidence`, `includeSameFile`, and `includeSmall` are only set when the user passed them.
+ *
+ * @throws Error when `--min-confidence` is invalid or `--max-tokens` is smaller than `--min-tokens`.
+ */
+function parseDuplicateDetectionOptions(context: DuplicatesCommandContext): DuplicateDetectionOptions {
+  const minConfidence = parseConfidenceOption(context.getOpt("--min-confidence"));
+  const options: DuplicateDetectionOptions = {
+    projectRoot: context.projectRootFs,
+    files: context.files,
+    ...(minConfidence !== undefined ? { minConfidence } : {}),
+    limit: parseNonNegativeIntegerOption(context.getOpt("--limit"), "--limit", 50),
+    minTokens: parsePositiveIntegerOption(context.getOpt("--min-tokens"), "--min-tokens", 40),
+    maxTokens: parsePositiveIntegerOption(context.getOpt("--max-tokens"), "--max-tokens", 800),
+    maxBucketSize: parsePositiveIntegerOption(context.getOpt("--max-bucket-size"), "--max-bucket-size", 200),
+    ...(context.hasFlag("--include-same-file") ? { includeSameFile: true } : {}),
+    ...(context.hasFlag("--include-small") ? { includeSmall: true } : {}),
+  };
+
+  if (options.maxTokens !== undefined && options.minTokens !== undefined && options.maxTokens < options.minTokens) {
+    throw new Error(
+      `Invalid --max-tokens value "${options.maxTokens}". Expected a value greater than or equal to --min-tokens.`,
+    );
+  }
+  return options;
+}
+
+/**
+ * Runs the `duplicates` command: builds an index for `context.files`, runs detection, and writes the result as JSON.
+ *
+ * Options are parsed before the index is built so an invalid flag is reported without first paying for indexing.
+ * Any thrown error is written to stderr with a `Duplicate detection failed:` prefix, then `context.exit(1)` is called.
+ */
+export async function handleDuplicatesCommand(context: DuplicatesCommandContext): Promise<void> {
+  try {
+    // Validate flags first; index construction is presumably the expensive step.
+    const options = parseDuplicateDetectionOptions(context);
+    const index = await buildProjectIndexFromFiles(context.projectRootFs, context.files, context.indexOptions);
+    const result = await findDuplicates(index, options);
+    context.writeJSONLine(result);
+  } catch (error) {
+    context.writeStderrLine(`Duplicate detection failed: ${error instanceof Error ? error.message : String(error)}`);
+    context.exit(1);
+  }
+}
diff --git a/src/cli/help.ts b/src/cli/help.ts
index 26190e3e..79807d83 100644
--- a/src/cli/help.ts
+++ b/src/cli/help.ts
@@ -19,6 +19,7 @@ Commands:
   path          Find the shortest dependency path between files
   cycles        Detect dependency cycles (use --sort priority|size|fanin)
   hotspots      Find high-complexity files
+  duplicates    Detect duplicate and near-duplicate code units
   unresolved    List unresolved project imports
   apisurface    Summarize exported API symbols
   grep          Run Tree-sitter query or text regex search
@@ -65,6 +66,7 @@ Examples:
   codegraph version
   codegraph doctor
   codegraph inspect ./src --limit 20
+  codegraph duplicates ./src --min-confidence medium
   codegraph search "auth user" --json
   codegraph explain src/auth.ts --json
   codegraph artifact build --root . --out codegraph-out --json
@@ -90,6 +92,7 @@ const knownCliCommands = new Set([
   "cycles",
   "deps",
   "doctor",
+  "duplicates",
   "dumpmod",
   "explain",
   "goto",
@@ -194,9 +197,31 @@ Defaults:
   Tools are read-only unless --allow-build is passed.
 `;
 
+export const DUPLICATES_HELP_TEXT = `codegraph duplicates - Detect duplicate and near-duplicate code units
+
+Usage: codegraph duplicates [path ...] [--root <path>] [--min-confidence high|medium|low] [--limit <n>] [--include-same-file] [--include-small]
+
+Path behavior:
+  A single positional directory becomes the project root when --root is omitted.
+  Use --root . ./src to scan a subtree while keeping repository-relative paths.
+
+Options:
+  --min-confidence  Minimum confidence to report. Defaults to medium.
+  --limit           Maximum suggestions to return. Defaults to 50.
+  --include-same-file Report non-overlapping clones in the same file.
+  --include-small   Include units below the default token floor.
+  --min-tokens      Minimum unit tokens. Defaults to 40.
+  --max-tokens      Maximum fallback chunk tokens. Defaults to 800.
+  --max-bucket-size Skip candidate buckets larger than this value. Defaults to 200.
+
+Output:
+  Always emits JSON with scored suggestions, confidence, clone type, metrics, omission counts, and pair stats.
+`;
+
 export function helpTextForCommand(command: string, positionals: readonly string[]): string | undefined {
   if (command === "search") return SEARCH_HELP_TEXT;
   if (command === "explain") return EXPLAIN_HELP_TEXT;
+  if (command === "duplicates") return DUPLICATES_HELP_TEXT;
   if (command === "artifact") return ARTIFACT_HELP_TEXT;
   if (command === "mcp") {
     return positionals[0] === "serve" ? MCP_SERVE_HELP_TEXT : MCP_HELP_TEXT;
diff --git a/src/cli/impact.ts b/src/cli/impact.ts
index b19a42da..897ece50 100644
--- a/src/cli/impact.ts
+++ b/src/cli/impact.ts
@@ -343,10 +343,12 @@ function applyAnalysisOptions(context: ImpactCommandContext, options: ImpactOpti
 function buildIndexOptions(context: ImpactCommandContext, options: ImpactOptionsBuilder): BuildOptions {
   const cacheMode =
     options.cache === "off" || options.cache === "memory" || options.cache === "disk" ? options.cache : undefined;
+  const keepParsed = options.refContext !== undefined;
   const indexOpts: BuildOptions = {
     threads: options.threads ?? 0,
     discovery: context.discoveryOptions,
     onProgress: context.progressHandler,
+    ...(keepParsed ? { keepParsed } : {}),
     ...(context.nativeMode !== "auto" ? { native: context.nativeMode } : {}),
     ...context.workerOpts,
     ...(cacheMode !== undefined ? { cache: cacheMode } : {}),
diff --git a/src/cli/options.ts b/src/cli/options.ts
index dee22c77..0497072b 100644
--- a/src/cli/options.ts
+++ b/src/cli/options.ts
@@ -40,6 +40,8 @@ const CLI_VALUE_OPTIONS = new Set<string>([
   "--language",
   "--min-tokens",
   "--max-tokens",
+  "--max-bucket-size",
+  "--min-confidence",
   "--max-hits",
   "--resolution-hint",
   "--review-depth",
diff --git a/src/duplicates.ts b/src/duplicates.ts
new file mode 100644
index 00000000..4223659c
--- /dev/null
+++ b/src/duplicates.ts
@@ -0,0 +1,864 @@
+import crypto from "node:crypto";
+import fsp from "node:fs/promises";
+import path from "node:path";
+import { LANG_CONFIGS } from "./bootstrap/treeSitterLanguages.js";
+import { chunkFile, type Chunk } from "./chunking/chunkFile.js";
+import { chunkTextFile } from "./chunking/chunkTextFile.js";
+import { supportForFile } from "./languages.js";
+import { SymbolKind, type ProjectIndex, type SymbolDef } from "./indexer/types.js";
+import { assertFilePathWithinRoot, normalizePath, toProjectDisplayPath } from "./util/paths.js";
+
+export type DuplicateConfidence = "high" | "medium" | "low"; // derived from the score: >= 85 high, >= 65 medium, otherwise low
+export type DuplicateCloneType = "exact" | "renamed" | "near" | "weak"; // exact: same raw-text hash; renamed: same normalized-token hash; near/weak: decided by overlap thresholds
+export type DuplicateUnitKind = "symbol" | "chunk"; // symbol: built from an indexed symbol; chunk: built from a file chunk
+
+export type DuplicateUnitRef = {
+  file: string; // display path (see displayPath)
+  startLine: number; // copied from the backing chunk
+  endLine: number; // copied from the backing chunk; lineSpan treats the range as inclusive
+  languageId: string;
+  kind: DuplicateUnitKind;
+  tokenCount: number; // number of tokens tokenizeSource finds in the unit text
+  name?: string;
+  symbolKind?: SymbolKind; // set only on symbol units
+  complexity?: number; // set only on symbol units whose symbol carries a complexity
+};
+
+export type DuplicateMetrics = {
+  tokenJaccard: number; // Jaccard similarity of the two normalized token sets
+  shingleOverlap: number; // Jaccard similarity of the two winnowed fingerprint sets
+  lengthRatio: number; // smaller tokenCount divided by larger tokenCount
+  lineSpanRatio: number; // smaller line span divided by larger line span
+  complexityDelta?: number; // absolute complexity difference; present only when both units have a complexity
+};
+
+export type DuplicateSuggestion = {
+  score: number; // rounded and clamped to 0..100
+  confidence: DuplicateConfidence;
+  cloneType: DuplicateCloneType;
+  left: DuplicateUnitRef; // left/right are ordered by absolute file path, then start line
+  right: DuplicateUnitRef;
+  metrics: DuplicateMetrics;
+  reasons: string[]; // evidence labels collected while scoring
+};
+
+export type DuplicateDetectionOptions = {
+  projectRoot?: string; // defaults to index.projectRoot
+  files?: readonly string[]; // defaults to every file key in index.byFile
+  minConfidence?: DuplicateConfidence; // defaults to "medium"
+  limit?: number; // non-negative integer; defaults to 50
+  crossFileOnly?: boolean; // when unset, it is the inverse of includeSameFile
+  includeSameFile?: boolean; // defaults to false; ignored when crossFileOnly is set
+  includeSmall?: boolean; // defaults to false; true keeps units below minTokens
+  minTokens?: number; // positive integer; defaults to 40
+  maxTokens?: number; // positive integer, must be >= minTokens; defaults to 800
+  maxBucketSize?: number; // positive integer; defaults to 200
+  shingleSize?: number; // positive integer; defaults to 5
+  windowSize?: number; // positive integer; defaults to 4
+};
+
+export type DuplicateDetectionOmittedCounts = {
+  suggestions: number; // suggestions cut off by the limit
+  oversizedBuckets: number; // buckets skipped for holding more than maxBucketSize units
+  belowThresholdUnits: number; // units dropped for having fewer than minTokens tokens
+  overlappingPairs: number; // same-file pairs skipped for overlapping line ranges; stays 0 when same-file pairs are excluded up front
+};
+
+export type DuplicateDetectionStats = {
+  comparedPairs: number; // pairs that were scored, including those later filtered by minConfidence
+  candidatePairs: number; // pairs left after candidate pruning, before the cross-file and overlap filters
+};
+
+export type DuplicateDetectionResult = {
+  schemaVersion: 1;
+  units: number; // number of units kept for comparison
+  suggestions: DuplicateSuggestion[]; // sorted by descending score, truncated to the limit
+  omittedCounts: DuplicateDetectionOmittedCounts;
+  stats: DuplicateDetectionStats;
+};
+
+type DuplicateInternalUnit = DuplicateUnitRef & {
+  id: string; // see internalUnitId
+  absoluteFile: string; // normalizePath() of the path the unit was built from
+  text: string;
+  rawHash: string; // SHA-256 hex of text
+  normalizedHash: string; // SHA-256 hex of the normalized tokens joined by spaces
+  normalizedTokens: string[];
+  tokenSet: Set<string>; // distinct normalized tokens
+  signatures: Set<string>; // winnowed shingle fingerprints
+};
+
+type DuplicateUnitDraft = Omit<DuplicateUnitRef, "tokenCount">; // tokenCount is filled in by buildInternalUnit
+
+type PairEvidence = {
+  left: DuplicateInternalUnit;
+  right: DuplicateInternalUnit;
+  rawHash: boolean; // pair shared a raw-hash bucket
+  normalizedHash: boolean; // pair shared a normalized-hash bucket
+  signature: boolean; // set in buildCandidatePairs once enough fingerprints are shared
+  signatureMatches: number; // number of signature buckets the pair shared
+};
+
+type LanguageForFileResult = {
+  id: string;
+  textOnly: boolean; // true when the id came from textLanguageByExtension
+};
+
+type ConsideredSignaturesByUnit = Map<string, Set<string>>; // unit id -> signature bucket keys actually used for pairing
+
+const DEFAULT_MIN_TOKENS = 40; // fallback for options.minTokens
+const DEFAULT_MAX_TOKENS = 800; // fallback for options.maxTokens
+const DEFAULT_LIMIT = 50; // fallback for options.limit
+const DEFAULT_MAX_BUCKET_SIZE = 200; // fallback for options.maxBucketSize
+const DEFAULT_SHINGLE_SIZE = 5; // tokens per shingle
+const DEFAULT_WINDOW_SIZE = 4; // shingles per winnowing window
+const DEFAULT_MAX_FINGERPRINTS = 128; // cap on fingerprints kept per unit; no option overrides it
+
+const textLanguageByExtension: Record<string, string> = { // consulted only when supportForFile has no match; looked up with a lowercased extension
+  ".json": "json",
+  ".jsonc": "jsonc",
+  ".lock": "text",
+  ".toml": "toml",
+  ".txt": "text",
+  ".yaml": "yaml",
+  ".yml": "yaml",
+};
+
+const chunkLanguageAliases: Record<string, string> = { // language id -> key used for the LANG_CONFIGS lookup
+  js: "javascript",
+  ts: "typescript",
+};
+
+const confidenceRank: Record<DuplicateConfidence, number> = { // higher number = stronger confidence; used for the minConfidence filter
+  low: 1,
+  medium: 2,
+  high: 3,
+};
+
+const symbolUnitKinds = new Set<SymbolKind>([ // symbol kinds that makeSymbolUnit turns into units; all other kinds are skipped
+  SymbolKind.Function,
+  SymbolKind.Class,
+  SymbolKind.Interface,
+  SymbolKind.TypeAlias,
+  SymbolKind.Routine,
+  SymbolKind.Table,
+  SymbolKind.View,
+]);
+
+const identifierKeywords = new Set([ // words normalizeToken keeps (lowercased) instead of collapsing to "<identifier>"; matched after lowercasing the token
+  "abstract",
+  "and",
+  "as",
+  "async",
+  "await",
+  "break",
+  "case",
+  "catch",
+  "class",
+  "const",
+  "continue",
+  "def",
+  "default",
+  "defer",
+  "delete",
+  "do",
+  "else",
+  "enum",
+  "export",
+  "extends",
+  "false",
+  "final",
+  "finally",
+  "fn",
+  "for",
+  "from",
+  "func",
+  "function",
+  "if",
+  "implements",
+  "import",
+  "in",
+  "interface",
+  "is",
+  "lambda",
+  "let",
+  "match",
+  "module",
+  "namespace",
+  "new",
+  "nil",
+  "none",
+  "not",
+  "null",
+  "or",
+  "package",
+  "private",
+  "protected",
+  "public",
+  "return",
+  "self",
+  "static",
+  "struct",
+  "switch",
+  "this",
+  "throw",
+  "throws",
+  "trait",
+  "true",
+  "try",
+  "type",
+  "use",
+  "using",
+  "var",
+  "void",
+  "when",
+  "where",
+  "while",
+]);
+
+function hashText(value: string): string { // SHA-256 digest of value, hex-encoded
+  return crypto.createHash("sha256").update(value).digest("hex");
+}
+
+function shortHashText(value: string): string { // first 16 hex characters of hashText(value)
+  return hashText(value).slice(0, 16);
+}
+
+function clampScore(score: number): number { // Rounds to the nearest integer, then limits the result to 0..100.
+  return Math.min(100, Math.max(0, Math.round(score)));
+}
+
+/** Splits source into quoted literals, identifiers (including names that start or end with "$"), numbers, and single punctuation characters. */
+function tokenizeSource(text: string): string[] {
+  return (
+    text.match(/"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|`(?:\\.|[^`\\])*`|(?:\b|(?<![\w$])(?=\$))[A-Za-z_$][\w$]*|\d+(?:\.\d+)?|[^\s]/g) ??
+    [] // match() returns null when text has no non-whitespace characters
+  );
+}
+
+function countDuplicateTokens(text: string): number { // token count under tokenizeSource; handed to the chunkers as their tokenizer
+  return tokenizeSource(text).length;
+}
+
+/** Replaces names and literals with placeholders while preserving keywords, operators, and punctuation. */
+function normalizeToken(token: string): string {
+  if (/^["'`]/.test(token)) return "<literal>"; // token starts with a quote character
+  if (/^\d/.test(token)) return "<literal>"; // token starts with a digit
+  if (/^[A-Za-z_$][\w$]*$/.test(token)) {
+    const lower = token.toLowerCase(); // keyword lookup ignores case
+    if (identifierKeywords.has(lower)) return lower;
+    return "<identifier>";
+  }
+  return token; // everything else passes through unchanged
+}
+
+/** Builds one short hash per sliding window of `size` consecutive tokens; returns [] when there are fewer than `size` tokens. */
+function makeShingles(tokens: readonly string[], size: number): string[] {
+  if (tokens.length < size) return []; // too short to form even one window
+  const shingles: string[] = [];
+  for (let i = 0; i <= tokens.length - size; i++) {
+    shingles.push(shortHashText(tokens.slice(i, i + size).join("\u0000"))); // NUL-joined window, hashed to 16 hex characters
+  }
+  return shingles;
+}
+
+/** Keeps the smallest shingle hash from each sliding window of `windowSize` shingles, up to `maxFingerprints` distinct values. */
+function winnowShingles(shingles: readonly string[], windowSize: number, maxFingerprints: number): Set<string> {
+  if (!shingles.length) return new Set();
+  if (shingles.length <= windowSize) return new Set(shingles.slice(0, maxFingerprints)); // at most one window's worth: keep the leading shingles up to the cap
+
+  const fingerprints = new Set<string>();
+  for (let i = 0; i <= shingles.length - windowSize; i++) {
+    let minimum = shingles[i]!; // smallest hash string seen so far in the window starting at i
+    for (let j = i + 1; j < i + windowSize; j++) {
+      const candidate = shingles[j]!;
+      if (candidate < minimum) minimum = candidate;
+    }
+    fingerprints.add(minimum);
+    if (fingerprints.size >= maxFingerprints) break; // cap reached; remaining windows are ignored
+  }
+  return fingerprints;
+}
+
+function lineSpan(unit: Pick<DuplicateUnitRef, "startLine" | "endLine">): number { // inclusive line count of the unit, never less than 1
+  return Math.max(1, unit.endLine - unit.startLine + 1);
+}
+
+function ratio(left: number, right: number): number { // Smaller value divided by larger value; 0 when either input is falsy (0 or NaN).
+  if (!(left && right)) return 0;
+  return left <= right ? left / right : right / left;
+}
+
+/** Measures set similarity as intersection size divided by union size; two empty sets score 1. */
+function jaccard(left: Set<string>, right: Set<string>): number {
+  if (!left.size && !right.size) return 1; // two empty sets count as identical
+  let intersection = 0;
+  for (const value of left) {
+    if (right.has(value)) intersection++;
+  }
+  const union = left.size + right.size - intersection;
+  if (!union) return 0; // defensive: union is only 0 when both sets are empty, which already returned above
+  return intersection / union;
+}
+
+function normalizeConfidence(value: DuplicateConfidence | undefined): DuplicateConfidence { // an unset confidence becomes "medium"
+  return value ?? "medium";
+}
+
+function normalizePositiveIntegerOption(value: number | undefined, optionName: string, fallback: number): number { // resolves value or fallback, then requires an integer >= 1; throws an Error otherwise
+  const resolved = value ?? fallback;
+  if (!Number.isInteger(resolved) || resolved < 1) {
+    throw new Error(`Invalid ${optionName} value "${String(resolved)}". Expected a positive integer.`);
+  }
+  return resolved;
+}
+
+function normalizeNonNegativeIntegerOption(value: number | undefined, optionName: string, fallback: number): number { // resolves value or fallback, then requires an integer >= 0; throws an Error otherwise
+  const resolved = value ?? fallback;
+  if (!Number.isInteger(resolved) || resolved < 0) {
+    throw new Error(`Invalid ${optionName} value "${String(resolved)}". Expected a non-negative integer.`);
+  }
+  return resolved;
+}
+
+function confidenceForScore(score: number): DuplicateConfidence { // Buckets a score: 85 and up is high, 65 and up is medium, anything else is low.
+  if (!(score >= 65)) return "low";
+  if (score < 85) return "medium";
+  return "high";
+}
+
+function cloneTypeForPair(evidence: PairEvidence, metrics: DuplicateMetrics): DuplicateCloneType { // checks run strongest-first; the first match wins
+  if (evidence.rawHash) return "exact";
+  if (evidence.normalizedHash && metrics.tokenJaccard >= 0.75) return "renamed";
+  if (metrics.shingleOverlap >= 0.55 || metrics.tokenJaccard >= 0.72) return "near";
+  return "weak";
+}
+
+function languageForFile(filePath: string): LanguageForFileResult | undefined { // prefers supportForFile, then the text-only extension table; undefined when neither matches
+  const support = supportForFile(filePath);
+  if (support) {
+    return { id: support.id, textOnly: false };
+  }
+  const languageId = textLanguageByExtension[path.extname(filePath).toLowerCase()]; // extension lookup ignores case
+  if (languageId) {
+    return { id: languageId, textOnly: true };
+  }
+  return undefined;
+}
+
+function displayPath(projectRoot: string | undefined, filePath: string): string { // path reported in results
+  if (!projectRoot) return normalizePath(filePath);
+  return toProjectDisplayPath(projectRoot, filePath) || normalizePath(filePath); // falls back when toProjectDisplayPath returns a falsy value
+}
+
+function normalizeDetectionFile(filePath: string, projectRoot: string | undefined): string { // without a root: normalizePath; with a root: whatever assertFilePathWithinRoot returns
+  if (!projectRoot) return normalizePath(filePath);
+  return assertFilePathWithinRoot(projectRoot, filePath, "Duplicate input file"); // NOTE(review): presumably throws for paths outside projectRoot; confirm in util/paths
+}
+
+function internalUnitId(unit: DuplicateUnitDraft, absoluteFile: string): string { // id = normalized path, start line, end line, kind, and name (empty when unnamed), colon-separated
+  return `${normalizePath(absoluteFile)}:${unit.startLine}:${unit.endLine}:${unit.kind}:${unit.name ?? ""}`;
+}
+
+/** Adds the id, raw and normalized hashes, normalized tokens, token count, and winnowed fingerprints to a unit draft. */
+function buildInternalUnit(
+  unit: DuplicateUnitDraft,
+  absoluteFile: string,
+  text: string,
+  shingleSize: number,
+  windowSize: number,
+): DuplicateInternalUnit {
+  const rawHash = hashText(text);
+  const sourceTokens = tokenizeSource(text);
+  const normalizedTokens = sourceTokens.map(normalizeToken);
+  const signatures = winnowShingles(
+    makeShingles(normalizedTokens, shingleSize), // shingles are built from normalized tokens, not raw text
+    windowSize,
+    DEFAULT_MAX_FINGERPRINTS,
+  );
+  return {
+    ...unit,
+    id: internalUnitId(unit, absoluteFile),
+    absoluteFile: normalizePath(absoluteFile),
+    text,
+    rawHash,
+    normalizedHash: hashText(normalizedTokens.join(" ")), // equal whenever two normalized token streams match
+    tokenCount: sourceTokens.length,
+    normalizedTokens,
+    tokenSet: new Set(normalizedTokens), // distinct normalized tokens, compared for tokenJaccard
+    signatures,
+  };
+}
+
+function makeSymbolUnit( // builds a unit for a symbol of a supported kind from its matching chunk; undefined for other kinds
+  symbol: SymbolDef,
+  chunk: Chunk,
+  projectRoot: string | undefined,
+  shingleSize: number,
+  windowSize: number,
+): DuplicateInternalUnit | undefined {
+  if (!symbolUnitKinds.has(symbol.kind)) return undefined;
+  const unit: DuplicateUnitDraft = {
+    file: displayPath(projectRoot, symbol.file),
+    startLine: chunk.startLine, // line range and text come from the chunk, not from symbol.range
+    endLine: chunk.endLine,
+    languageId: chunk.languageId,
+    kind: "symbol",
+    name: symbol.localName,
+    symbolKind: symbol.kind,
+    ...(symbol.complexity !== undefined ? { complexity: symbol.complexity } : {}), // key omitted when the symbol has no complexity
+  };
+  return buildInternalUnit(unit, symbol.file, chunk.text, shingleSize, windowSize);
+}
+
+function makeDuplicateChunks( // uses chunkFile when a LANG_CONFIGS entry exists and the file is not text-only; otherwise chunkTextFile
+  filePath: string,
+  languageId: string,
+  textOnly: boolean,
+  source: string,
+  minTokens: number,
+  maxTokens: number,
+): Chunk[] {
+  const langConfig = LANG_CONFIGS[chunkLanguageAliases[languageId] ?? languageId]; // alias first, then the id itself
+  if (langConfig && !textOnly) {
+    return chunkFile({ language: langConfig, source, filePath, minTokens, maxTokens, tokenizer: countDuplicateTokens });
+  }
+  return chunkTextFile({ source, filePath, languageId, minTokens, maxTokens, tokenizer: countDuplicateTokens });
+}
+
+function makeSymbolSourceChunks( // chunks used to locate symbol bodies; none for text-only files
+  filePath: string,
+  languageId: string,
+  textOnly: boolean,
+  source: string,
+  maxTokens: number,
+): Chunk[] {
+  if (textOnly) return [];
+  return makeDuplicateChunks(filePath, languageId, false, source, 1, maxTokens); // passes minTokens = 1 instead of the configured floor
+}
+
+function findChunkForSymbol(symbol: SymbolDef, chunks: readonly Chunk[]): Chunk | undefined { // smallest multi-line chunk with the symbol's name that contains its start line
+  const symbolLine = Math.max(1, symbol.range.start.line); // clamp to at least line 1
+  const candidates = chunks.filter(
+    (chunk) =>
+      chunk.name === symbol.localName &&
+      chunk.startLine <= symbolLine &&
+      symbolLine <= chunk.endLine &&
+      chunk.endLine > chunk.startLine, // single-line chunks are never matched
+  );
+  candidates.sort((left, right) => lineSpan(left) - lineSpan(right)); // shortest span first
+  return candidates[0];
+}
+
+/** Turns every chunk of a file into a "chunk" unit so body-level clones are still visible. */
+function makeChunkUnits(
+  filePath: string,
+  chunks: readonly Chunk[],
+  projectRoot: string | undefined,
+  shingleSize: number,
+  windowSize: number,
+): DuplicateInternalUnit[] {
+  return chunks.map((chunk) => {
+    const unit: DuplicateUnitDraft = {
+      file: displayPath(projectRoot, filePath),
+      startLine: chunk.startLine,
+      endLine: chunk.endLine,
+      languageId: chunk.languageId,
+      kind: "chunk",
+      ...(chunk.name !== undefined ? { name: chunk.name } : {}), // key omitted for unnamed chunks
+    };
+    return buildInternalUnit(unit, filePath, chunk.text, shingleSize, windowSize);
+  });
+}
+
+function shouldKeepUnit(unit: DuplicateInternalUnit, includeSmall: boolean, minTokens: number): boolean { // includeSmall bypasses the minTokens floor
+  if (includeSmall) return true;
+  return unit.tokenCount >= minTokens;
+}
+
+function pairKey(left: DuplicateInternalUnit, right: DuplicateInternalUnit): string { // Order-independent key: the smaller id comes first, NUL-separated.
+  const [first, second] = left.id < right.id ? [left.id, right.id] : [right.id, left.id];
+  return `${first}\u0000${second}`;
+}
+
+function orderedPair(left: DuplicateInternalUnit, right: DuplicateInternalUnit): [DuplicateInternalUnit, DuplicateInternalUnit] { // orders by absoluteFile, then startLine
+  if (left.absoluteFile < right.absoluteFile) return [left, right];
+  if (left.absoluteFile > right.absoluteFile) return [right, left];
+  if (left.startLine <= right.startLine) return [left, right]; // same file: earlier start first, ties keep the given order
+  return [right, left];
+}
+
+function hasLineOverlap(left: DuplicateInternalUnit, right: DuplicateInternalUnit): boolean { // True only for units in the same file whose inclusive line ranges intersect.
+  const sameFile = left.absoluteFile === right.absoluteFile;
+  return sameFile && right.startLine <= left.endLine && left.startLine <= right.endLine;
+}
+
+/** Records every pair of units in one bucket, merging the evidence into an existing pair entry when the pair was already seen. */
+function addBucketPairs(
+  bucket: readonly DuplicateInternalUnit[],
+  pairs: Map<string, PairEvidence>,
+  evidenceKind: "rawHash" | "normalizedHash" | "signature",
+): void {
+  for (let i = 0; i < bucket.length; i++) {
+    for (let j = i + 1; j < bucket.length; j++) {
+      const [left, right] = orderedPair(bucket[i]!, bucket[j]!);
+      const key = pairKey(left, right);
+      const existing = pairs.get(key);
+      if (existing) {
+        if (evidenceKind === "signature") {
+          existing.signatureMatches++; // one increment per shared signature bucket
+        } else {
+          existing[evidenceKind] = true;
+        }
+        continue;
+      }
+      pairs.set(key, {
+        left,
+        right,
+        rawHash: evidenceKind === "rawHash",
+        normalizedHash: evidenceKind === "normalizedHash",
+        signature: false, // decided later in buildCandidatePairs
+        signatureMatches: evidenceKind === "signature" ? 1 : 0,
+      });
+    }
+  }
+}
+
+/** Pairs up every bucket of 2..maxBucketSize units and returns how many larger buckets were skipped. */
+function addBucketsToPairs(
+  buckets: Map<string, DuplicateInternalUnit[]>,
+  pairs: Map<string, PairEvidence>,
+  evidenceKind: "rawHash" | "normalizedHash" | "signature",
+  maxBucketSize: number,
+): number {
+  let oversizedBuckets = 0;
+  for (const bucket of buckets.values()) {
+    if (bucket.length < 2) continue; // nothing to pair
+    if (bucket.length > maxBucketSize) {
+      oversizedBuckets++;
+      continue; // skipped entirely: no pairs are taken from an oversized bucket
+    }
+    addBucketPairs(bucket, pairs, evidenceKind);
+  }
+  return oversizedBuckets;
+}
+
+function addConsideredSignature( // records that `signature` was used for pairing `unit`, creating the unit's set on first use
+  consideredSignaturesByUnit: ConsideredSignaturesByUnit,
+  unit: DuplicateInternalUnit,
+  signature: string,
+): void {
+  const signatures = consideredSignaturesByUnit.get(unit.id);
+  if (signatures) {
+    signatures.add(signature);
+    return;
+  }
+  consideredSignaturesByUnit.set(unit.id, new Set([signature]));
+}
+
+function addSignatureBucketsToPairs( // like addBucketsToPairs for signature buckets, and also records which bucket keys each unit was paired through
+  buckets: Map<string, DuplicateInternalUnit[]>,
+  pairs: Map<string, PairEvidence>,
+  consideredSignaturesByUnit: ConsideredSignaturesByUnit,
+  maxBucketSize: number,
+): number {
+  let oversizedBuckets = 0;
+  for (const [signature, bucket] of buckets) {
+    if (bucket.length < 2) continue; // a signature held by a single unit is not recorded as considered
+    if (bucket.length > maxBucketSize) {
+      oversizedBuckets++;
+      continue; // oversized buckets are not recorded as considered either
+    }
+    for (const unit of bucket) {
+      addConsideredSignature(consideredSignaturesByUnit, unit, signature);
+    }
+    addBucketPairs(bucket, pairs, "signature");
+  }
+  return oversizedBuckets;
+}
+
+/** Combines exact, normalized, fingerprint, size, and complexity signals into a 0..100 score plus evidence labels. */
+function scorePair(evidence: PairEvidence, metrics: DuplicateMetrics): { score: number; reasons: string[] } {
+  let score = 0;
+  const reasons: string[] = [];
+
+  if (evidence.rawHash) {
+    score += 68;
+    reasons.push("identical text");
+  }
+  if (evidence.normalizedHash) {
+    score += 48;
+    reasons.push("matching normalized token stream");
+  }
+  if (evidence.signature) {
+    score += 14;
+    reasons.push("shared fingerprint bucket");
+  }
+
+  score += metrics.tokenJaccard * 24; // each metric is multiplied by a fixed weight
+  score += metrics.shingleOverlap * 26;
+  score += metrics.lengthRatio * 8;
+  score += metrics.lineSpanRatio * 5;
+
+  if (evidence.left.symbolKind !== undefined && evidence.left.symbolKind === evidence.right.symbolKind) {
+    score += 4;
+    reasons.push(`matching ${evidence.left.symbolKind} units`);
+  }
+  if (metrics.complexityDelta !== undefined && metrics.complexityDelta <= 2) {
+    score += 3;
+    reasons.push("similar complexity");
+  }
+  if (metrics.lengthRatio < 0.45) score -= 18; // penalty for very different token counts; adds no reason label
+  if (evidence.left.absoluteFile === evidence.right.absoluteFile) score -= 8; // penalty for same-file pairs; adds no reason label
+
+  return { score: clampScore(score), reasons }; // the raw sum can exceed 100 before clamping
+}
+
+function metricsForPair(evidence: PairEvidence): DuplicateMetrics { // computes the similarity metrics for one candidate pair
+  const left = evidence.left;
+  const right = evidence.right;
+  const metrics: DuplicateMetrics = {
+    tokenJaccard: jaccard(left.tokenSet, right.tokenSet),
+    shingleOverlap: jaccard(left.signatures, right.signatures), // 1 when neither unit has any fingerprints
+    lengthRatio: ratio(left.tokenCount, right.tokenCount),
+    lineSpanRatio: ratio(lineSpan(left), lineSpan(right)),
+  };
+  if (left.complexity !== undefined && right.complexity !== undefined) {
+    metrics.complexityDelta = Math.abs(left.complexity - right.complexity); // only set when both sides have a complexity
+  }
+  return metrics;
+}
+
+/** Reads each file from disk, builds symbol and chunk units, applies the token floor, and returns the kept units sorted by file, line, and name. */
+async function collectDuplicateUnits(
+  index: ProjectIndex,
+  options: Required<
+    Pick<DuplicateDetectionOptions, "includeSmall" | "minTokens" | "maxTokens" | "shingleSize" | "windowSize">
+  > & { projectRoot: string | undefined; files: readonly string[] | undefined },
+): Promise<{ units: DuplicateInternalUnit[]; belowThresholdUnits: number }> {
+  const files = options.files ?? Array.from(index.byFile.keys());
+  const normalizedFiles = Array.from(new Set(files.map((file) => normalizeDetectionFile(file, options.projectRoot)))).sort(); // de-duplicated, then sorted
+  const units: DuplicateInternalUnit[] = [];
+  let belowThresholdUnits = 0;
+
+  for (const file of normalizedFiles) {
+    const moduleIndex = index.byFile.get(file); // may be undefined; then the file yields chunk units only
+    const language = languageForFile(file);
+    if (!language) continue; // no language support and no text-only mapping
+
+    let source: string;
+    try {
+      source = await fsp.readFile(file, "utf8");
+    } catch { // unreadable files are skipped without an error or a counter
+      continue;
+    }
+
+    const chunks = makeDuplicateChunks(
+      file,
+      language.id,
+      language.textOnly,
+      source,
+      options.minTokens,
+      options.maxTokens,
+    );
+    const symbolChunks = makeSymbolSourceChunks(file, language.id, language.textOnly, source, options.maxTokens);
+    const symbolUnits = (moduleIndex?.locals ?? [])
+      .map((symbol) => {
+        const chunk = findChunkForSymbol(symbol, symbolChunks);
+        if (!chunk) return undefined; // symbols without a matching chunk produce no unit
+        return makeSymbolUnit(symbol, chunk, options.projectRoot, options.shingleSize, options.windowSize);
+      })
+      .filter((unit): unit is DuplicateInternalUnit => unit !== undefined);
+    const chunkUnits = makeChunkUnits(
+      file,
+      chunks,
+      options.projectRoot,
+      options.shingleSize,
+      options.windowSize,
+    );
+    const candidates = [...symbolUnits, ...chunkUnits];
+
+    for (const unit of candidates) {
+      if (!shouldKeepUnit(unit, options.includeSmall, options.minTokens)) {
+        belowThresholdUnits++;
+        continue;
+      }
+      units.push(unit);
+    }
+  }
+
+  units.sort((left, right) => { // order by file, then start line, then name
+    const fileCompare = left.absoluteFile.localeCompare(right.absoluteFile);
+    if (fileCompare) return fileCompare;
+    const lineCompare = left.startLine - right.startLine;
+    if (lineCompare) return lineCompare;
+    return (left.name ?? "").localeCompare(right.name ?? "");
+  });
+  return { units, belowThresholdUnits };
+}
+
+function addToBucket(buckets: Map<string, DuplicateInternalUnit[]>, key: string, unit: DuplicateInternalUnit): void { // appends unit to the bucket for key, creating the bucket if needed
+  const bucket = buckets.get(key);
+  if (bucket) {
+    bucket.push(unit);
+    return;
+  }
+  buckets.set(key, [unit]);
+}
+
+/** Buckets units by raw hash, normalized hash, and fingerprint (per language), pairs up bucket members, and prunes weak signature-only pairs. */
+function buildCandidatePairs(
+  units: readonly DuplicateInternalUnit[],
+  maxBucketSize: number,
+): { pairs: Map<string, PairEvidence>; oversizedBuckets: number } {
+  const rawHashBuckets = new Map<string, DuplicateInternalUnit[]>();
+  const normalizedHashBuckets = new Map<string, DuplicateInternalUnit[]>();
+  const signatureBuckets = new Map<string, DuplicateInternalUnit[]>();
+
+  for (const unit of units) {
+    const languagePrefix = `${unit.languageId}:`; // keeps units with different language ids in separate buckets
+    addToBucket(rawHashBuckets, `${languagePrefix}${unit.rawHash}`, unit);
+    addToBucket(normalizedHashBuckets, `${languagePrefix}${unit.normalizedHash}`, unit);
+    for (const signature of unit.signatures) {
+      addToBucket(signatureBuckets, `${languagePrefix}${signature}`, unit);
+    }
+  }
+
+  const pairs = new Map<string, PairEvidence>();
+  const consideredSignaturesByUnit: ConsideredSignaturesByUnit = new Map();
+  let oversizedBuckets = 0;
+  oversizedBuckets += addBucketsToPairs(rawHashBuckets, pairs, "rawHash", maxBucketSize);
+  oversizedBuckets += addBucketsToPairs(normalizedHashBuckets, pairs, "normalizedHash", maxBucketSize);
+  oversizedBuckets += addSignatureBucketsToPairs(signatureBuckets, pairs, consideredSignaturesByUnit, maxBucketSize);
+  for (const [key, evidence] of pairs) {
+    if (hasEnoughSharedFingerprints(evidence, consideredSignaturesByUnit)) {
+      evidence.signature = true;
+      continue;
+    }
+    if (!evidence.rawHash && !evidence.normalizedHash) {
+      pairs.delete(key); // signature-only pair with too few shared fingerprints; deleting during Map iteration is safe
+    }
+  }
+  return { pairs, oversizedBuckets };
+}
+
+/** Requires the pair to share at least 2 signature buckets and at least 25% of the smaller unit's considered signatures. */
+function hasEnoughSharedFingerprints(
+  evidence: PairEvidence,
+  consideredSignaturesByUnit: ConsideredSignaturesByUnit,
+): boolean {
+  if (!evidence.signatureMatches) return false;
+  const leftConsideredSignatures = consideredSignaturesByUnit.get(evidence.left.id)?.size ?? 0;
+  const rightConsideredSignatures = consideredSignaturesByUnit.get(evidence.right.id)?.size ?? 0;
+  const smallerConsideredSignatureCount = Math.min(leftConsideredSignatures, rightConsideredSignatures);
+  if (!smallerConsideredSignatureCount) return false;
+  const minimumShared = Math.max(2, Math.ceil(smallerConsideredSignatureCount * 0.25)); // never fewer than 2
+  return evidence.signatureMatches >= minimumShared;
+}
+
+function suggestionForPair(evidence: PairEvidence): DuplicateSuggestion { // scores one candidate pair and shapes it for output
+  const metrics = metricsForPair(evidence);
+  const { score, reasons } = scorePair(evidence, metrics);
+  return {
+    score,
+    confidence: confidenceForScore(score),
+    cloneType: cloneTypeForPair(evidence, metrics),
+    left: unitRef(evidence.left), // internal fields such as text and hashes are stripped
+    right: unitRef(evidence.right),
+    metrics,
+    reasons,
+  };
+}
+
+function unitRef(unit: DuplicateInternalUnit): DuplicateUnitRef { // copies only the public DuplicateUnitRef fields
+  return {
+    file: unit.file,
+    startLine: unit.startLine,
+    endLine: unit.endLine,
+    languageId: unit.languageId,
+    kind: unit.kind,
+    tokenCount: unit.tokenCount,
+    ...(unit.name !== undefined ? { name: unit.name } : {}), // optional keys are omitted rather than set to undefined
+    ...(unit.symbolKind !== undefined ? { symbolKind: unit.symbolKind } : {}),
+    ...(unit.complexity !== undefined ? { complexity: unit.complexity } : {}),
+  };
+}
+
+/** Finds scored duplicate candidates from an already-built project index; reads source files from disk and throws an Error on invalid numeric options. */
+export async function findDuplicates(
+  index: ProjectIndex,
+  options: DuplicateDetectionOptions = {},
+): Promise<DuplicateDetectionResult> {
+  const projectRoot = options.projectRoot ?? index.projectRoot;
+  const minTokens = normalizePositiveIntegerOption(options.minTokens, "minTokens", DEFAULT_MIN_TOKENS);
+  const maxTokens = normalizePositiveIntegerOption(options.maxTokens, "maxTokens", DEFAULT_MAX_TOKENS);
+  const maxBucketSize = normalizePositiveIntegerOption(
+    options.maxBucketSize,
+    "maxBucketSize",
+    DEFAULT_MAX_BUCKET_SIZE,
+  );
+  const shingleSize = normalizePositiveIntegerOption(options.shingleSize, "shingleSize", DEFAULT_SHINGLE_SIZE);
+  const windowSize = normalizePositiveIntegerOption(options.windowSize, "windowSize", DEFAULT_WINDOW_SIZE);
+  const includeSmall = options.includeSmall ?? false;
+  const crossFileOnly = options.crossFileOnly ?? !(options.includeSameFile ?? false); // an explicit crossFileOnly wins over includeSameFile
+  const minConfidence = normalizeConfidence(options.minConfidence);
+  const limit = normalizeNonNegativeIntegerOption(options.limit, "limit", DEFAULT_LIMIT); // 0 is allowed and returns no suggestions
+
+  if (maxTokens < minTokens) {
+    throw new Error(`Invalid maxTokens value "${maxTokens}". Expected a value greater than or equal to minTokens.`);
+  }
+
+  const { units, belowThresholdUnits } = await collectDuplicateUnits(index, {
+    projectRoot,
+    files: options.files,
+    includeSmall,
+    minTokens,
+    maxTokens,
+    shingleSize,
+    windowSize,
+  });
+  const { pairs, oversizedBuckets } = buildCandidatePairs(units, maxBucketSize);
+  const suggestions: DuplicateSuggestion[] = [];
+  let overlappingPairs = 0;
+  let comparedPairs = 0;
+
+  for (const evidence of pairs.values()) {
+    if (crossFileOnly && evidence.left.absoluteFile === evidence.right.absoluteFile) continue; // dropped without touching any counter
+    if (hasLineOverlap(evidence.left, evidence.right)) {
+      overlappingPairs++;
+      continue;
+    }
+
+    comparedPairs++;
+    const suggestion = suggestionForPair(evidence);
+    if (confidenceRank[suggestion.confidence] < confidenceRank[minConfidence]) continue; // scored but below the requested confidence
+    suggestions.push(suggestion);
+  }
+
+  suggestions.sort((left, right) => { // highest score first; ties broken by left file, right file, then left start line
+    const scoreCompare = right.score - left.score;
+    if (scoreCompare) return scoreCompare;
+    const leftFileCompare = left.left.file.localeCompare(right.left.file);
+    if (leftFileCompare) return leftFileCompare;
+    const rightFileCompare = left.right.file.localeCompare(right.right.file);
+    if (rightFileCompare) return rightFileCompare;
+    return left.left.startLine - right.left.startLine;
+  });
+
+  const limitedSuggestions = suggestions.slice(0, limit);
+  return {
+    schemaVersion: 1,
+    units: units.length,
+    suggestions: limitedSuggestions,
+    omittedCounts: {
+      suggestions: Math.max(0, suggestions.length - limitedSuggestions.length),
+      oversizedBuckets,
+      belowThresholdUnits,
+      overlappingPairs,
+    },
+    stats: {
+      comparedPairs,
+      candidatePairs: pairs.size, // counted before the cross-file and overlap filters above
+    },
+  };
+}
diff --git a/src/index.ts b/src/index.ts
index d041bd3f..11794243 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -316,6 +316,21 @@ export { chunkFile, type Chunk, type ChunkFileOptions } from "./chunking/chunkFi
 export { chunkTextFile, type TextChunkOptions } from "./chunking/chunkTextFile.js";
 export { chunkSFCFile, type ChunkSFCOptions } from "./chunking/chunkSFC.js";
 
+/** Duplicate and near-duplicate code detection over an already-built project index (source files are read from disk). */
+export {
+  findDuplicates,
+  type DuplicateCloneType,
+  type DuplicateConfidence,
+  type DuplicateDetectionOmittedCounts,
+  type DuplicateDetectionOptions,
+  type DuplicateDetectionResult,
+  type DuplicateDetectionStats,
+  type DuplicateMetrics,
+  type DuplicateSuggestion,
+  type DuplicateUnitKind,
+  type DuplicateUnitRef,
+} from "./duplicates.js";
+
 /** Tree-sitter language configuration registry. */
 export { LANG_CONFIGS, type LanguageConfig } from "./bootstrap/treeSitterLanguages.js";
 
diff --git a/tests/cli-command-modules.test.ts b/tests/cli-command-modules.test.ts
index 801d813a..651519ce 100644
--- a/tests/cli-command-modules.test.ts
+++ b/tests/cli-command-modules.test.ts
@@ -2,17 +2,20 @@ import fs from "node:fs";
 import fsp from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
-import { describe, expect, test } from "vitest";
+import { describe, expect, test, vi } from "vitest";
 import { handleChunkCommand, type ChunkCommandContext } from "../src/cli/chunk.js";
 import { buildDoctorReport } from "../src/cli/doctor.js";
 import { handleGraphDeltaCommand } from "../src/cli/graphDelta.js";
 import { handleGraphQueryCommand, type GraphQueryCommandContext } from "../src/cli/graphQueries.js";
 import { CLI_HELP_TEXT, MCP_SERVE_HELP_TEXT } from "../src/cli/help.js";
+import { handleImpactCommand, type ImpactCommandContext } from "../src/cli/impact.js";
 import { getCodegraphPackageIdentity, getCodegraphVersion } from "../src/cli/packageInfo.js";
 import { handleSkillCommand, type SkillCommandContext } from "../src/cli/skill.js";
 import { handleSqlCommand } from "../src/cli/sql.js";
 import { runCli } from "../src/cli.js";
+import * as indexerBuild from "../src/indexer/build-index.js";
 import type { ProjectIndex } from "../src/indexer.js";
+import type { BuildOptions } from "../src/indexer/types.js";
 import type { Graph } from "../src/types.js";
 
 function readJsonRecord(value: unknown): Record<string, unknown> {
@@ -96,6 +99,35 @@ function createSkillContext(overrides: Partial<SkillCommandContext>): SkillComma
   };
 }
 
+/** Builds an ImpactCommandContext whose output and exit hooks throw unless `overrides` replaces them. */
+function createImpactContext(overrides: Partial<ImpactCommandContext>): ImpactCommandContext {
+  const failWith = (message: string) => (): never => {
+    throw new Error(message);
+  };
+  const projectRoot = path.join(os.tmpdir(), "codegraph-impact-context").replace(/\\/g, "/");
+  const defaults: ImpactCommandContext = {
+    projectRootFs: projectRoot,
+    discoveryOptions: {},
+    getOpt: (name) => {
+      if (name === "--provider") return "raw";
+      return undefined;
+    },
+    hasFlag: () => false,
+    parsedOptions: new Map(),
+    nativeMode: "auto",
+    workerOpts: {},
+    graphOptions: undefined,
+    progressHandler: undefined,
+    readStdin: async () => "",
+    writeJSONLine: failWith("unexpected json output"),
+    writeStdoutLine: failWith("unexpected stdout"),
+    writeStderrLine: failWith("unexpected stderr"),
+    exit: (code) => failWith(`impact exit ${code}`)(),
+  };
+  // Overrides are spread last so a test can replace any default, including the throwing hooks.
+  return { ...defaults, ...overrides };
+}
+
 async function captureCli(
   args: string[],
   cwd = process.cwd(),
@@ -481,6 +513,56 @@ describe("CLI command modules", () => {
     ).rejects.toThrow('Invalid --cache value "banana". Expected one of: off, memory, disk.');
   });
 
+  test("impact command retains parsed cache only when reference context is requested", async () => {
+    const tempDir = await fsp.mkdtemp(path.join(os.tmpdir(), "codegraph-impact-module-"));
+    const diffText = [
+      "diff --git a/feature.ts b/feature.ts",
+      "index 1111111..2222222 100644",
+      "--- a/feature.ts",
+      "+++ b/feature.ts",
+      "@@ -1,3 +1,3 @@",
+      " export function feature() {",
+      "-  return 1;",
+      "+  return 2;",
+      " }",
+      "",
+    ].join("\n");
+    // Record every call, even one made without options, so it cannot shift the indices asserted below.
+    const capturedIndexOptions: Array<BuildOptions | undefined> = [];
+    const originalBuildProjectIndex = indexerBuild.buildProjectIndex;
+    const buildSpy = vi.spyOn(indexerBuild, "buildProjectIndex").mockImplementation(async (projectRoot, opts) => {
+      capturedIndexOptions.push(opts);
+      return await originalBuildProjectIndex(projectRoot, opts);
+    });
+
+    try {
+      await fsp.writeFile(path.join(tempDir, "feature.ts"), "export function feature() {\n  return 2;\n}\n", "utf8");
+      const baseContext = {
+        projectRootFs: tempDir,
+        readStdin: async () => diffText,
+        writeJSONLine: () => undefined,
+      } satisfies Partial<ImpactCommandContext>;
+      // The first run uses the default option lookup; the second also answers "line" for --ref-context.
+      await handleImpactCommand(createImpactContext(baseContext));
+      await handleImpactCommand(
+        createImpactContext({
+          ...baseContext,
+          getOpt: (name) => {
+            if (name === "--provider") return "raw";
+            if (name === "--ref-context") return "line";
+            return undefined;
+          },
+        }),
+      );
+
+      expect(capturedIndexOptions[0]?.keepParsed).toBeUndefined();
+      expect(capturedIndexOptions[1]?.keepParsed).toBe(true);
+    } finally {
+      buildSpy.mockRestore();
+      await fsp.rm(tempDir, { recursive: true, force: true });
+    }
+  });
+
   test("chunks files through the extracted chunk command handler", async () => {
     const tempDir = await fsp.mkdtemp(path.join(os.tmpdir(), "codegraph-chunk-module-"));
     const filePath = path.join(tempDir, "sample.ts");
diff --git a/tests/duplicates.test.ts b/tests/duplicates.test.ts
new file mode 100644
index 00000000..831128b3
--- /dev/null
+++ b/tests/duplicates.test.ts
@@ -0,0 +1,504 @@
+import fsp from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, describe, expect, test } from "vitest";
+import { runCli } from "../src/cli.js";
+import { buildProjectIndex, findDuplicates } from "../src/index.js";
+
+const tempRoots: string[] = [];
+
+async function makeTempProject(): Promise<string> {
+  const createdRoot = await fsp.mkdtemp(path.join(os.tmpdir(), "codegraph-duplicates-"));
+  tempRoots.push(createdRoot); // the un-normalized path is what gets registered for cleanup
+  return createdRoot.split("\\").join("/"); // callers receive forward slashes only
+}
+
+async function writeProjectFile(root: string, relativePath: string, source: string): Promise<string> {
+  const target = path.join(root, relativePath).split("\\").join("/");
+  await fsp.mkdir(path.dirname(target), { recursive: true }); // create missing parent directories first
+  await fsp.writeFile(target, source);
+  return target; // slash-normalized path of the file just written
+}
+
+/** Runs the CLI in-process against `cwd`, collecting stdout, stderr, and any requested exit code. */
+async function captureCli(
+  args: string[],
+  cwd: string,
+): Promise<{ stdout: string; stderr: string; exitCode: number | undefined }> {
+  const captured = { stdout: "", stderr: "", exitCode: undefined as number | undefined };
+
+  await runCli(args, {
+    cwd: () => cwd,
+    stdout: (chunk) => {
+      captured.stdout += chunk;
+    },
+    stderr: (chunk) => {
+      captured.stderr += chunk;
+    },
+    exit: (code) => {
+      captured.exitCode = code;
+      throw new Error(`cli exit ${code}`);
+    },
+  }).catch((error: unknown) => {
+    // Only the sentinel thrown by the exit hook above is expected; anything else is rethrown.
+    const isExitSentinel = error instanceof Error && error.message === `cli exit ${captured.exitCode}`;
+    if (captured.exitCode === undefined || !isExitSentinel) throw error;
+  });
+
+  return captured;
+}
+
+afterEach(async () => { // splice(0) empties tempRoots, so every registered root is removed once
+  await Promise.all(tempRoots.splice(0).map(async (root) => await fsp.rm(root, { recursive: true, force: true })));
+});
+
+describe("duplicate detection", () => {
+  test("reports exact duplicate functions across files", async () => {
+    const root = await makeTempProject();
+    const duplicateSource = `
+export function normalizeInvoiceRows(rows: Array<{ amount: number; tax: number }>) {
+  const totals: number[] = [];
+  const labels: string[] = [];
+  for (const row of rows) {
+    const subtotal = row.amount + row.tax;
+    const rounded = Math.round(subtotal * 100) / 100;
+    const label = rounded > 100 ? "large" : "small";
+    labels.push(label);
+    totals.push(rounded);
+  }
+  const encoded = totals.map((value, index) => labels[index] + ":" + value.toFixed(2));
+  return encoded.filter((value) => value.includes(":")).join(",");
+}
+`;
+    // The same source text goes into two files, so the pair is a byte-for-byte clone.
+    await writeProjectFile(root, "src/a.ts", duplicateSource);
+    await writeProjectFile(root, "src/b.ts", duplicateSource);
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, { minConfidence: "high", limit: 5 });
+    // The top suggestion is expected to pair src/a.ts (left) with src/b.ts (right).
+    expect(result.suggestions.length).toBeGreaterThan(0);
+    expect(result.suggestions[0]?.cloneType).toBe("exact");
+    expect(result.suggestions[0]?.confidence).toBe("high");
+    expect(result.suggestions[0]?.left.file).toBe("src/a.ts");
+    expect(result.suggestions[0]?.right.file).toBe("src/b.ts");
+  });
+
+  test("returns bounded suggestions with omission counts", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function summarizePayments(rows: Array<{ amount: number; fee: number }>) {
+  const output: string[] = [];
+  for (const row of rows) {
+    const subtotal = row.amount + row.fee;
+    const rounded = Math.round(subtotal * 100) / 100;
+    const label = rounded > 100 ? "large" : "small";
+    output.push(label + ":" + rounded.toFixed(2));
+  }
+  return output.filter((value) => value.includes(":")).join(",");
+}
+`;
+    // Three identical files are written while the query below allows a single suggestion.
+    await writeProjectFile(root, "src/a.ts", source);
+    await writeProjectFile(root, "src/b.ts", source);
+    await writeProjectFile(root, "src/c.ts", source);
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, { minConfidence: "high", limit: 1 });
+    // Whatever the limit cut off must be reported in omittedCounts, and the stats must still be populated.
+    expect(result.suggestions).toHaveLength(1);
+    expect(result.omittedCounts.suggestions).toBeGreaterThan(0);
+    expect(result.stats.candidatePairs).toBeGreaterThan(0);
+    expect(result.stats.comparedPairs).toBeGreaterThan(0);
+  });
+
+  test("reports renamed near duplicates through normalized tokens", async () => {
+    const root = await makeTempProject();
+
+    await writeProjectFile(
+      root,
+      "src/users.ts",
+      `
+export function scoreUsers(users: Array<{ active: boolean; points: number }>) {
+  const scored: number[] = [];
+  const labels: string[] = [];
+  for (const user of users) {
+    const base = user.active ? user.points : 0;
+    const bonus = base > 50 ? 10 : 2;
+    const label = bonus > 5 ? "priority" : "standard";
+    labels.push(label);
+    scored.push(base + bonus);
+  }
+  const total = scored.reduce((currentTotal, value) => currentTotal + value, 0);
+  return total + labels.filter((label) => label.length > 0).length;
+}
+`,
+    );
+    await writeProjectFile(
+      root,
+      "src/accounts.ts",
+      `
+export function scoreAccounts(accounts: Array<{ enabled: boolean; credits: number }>) {
+  const values: number[] = [];
+  const tags: string[] = [];
+  for (const account of accounts) {
+    const baseValue = account.enabled ? account.credits : 0;
+    const extra = baseValue > 50 ? 10 : 2;
+    const tag = extra > 5 ? "priority" : "standard";
+    tags.push(tag);
+    values.push(baseValue + extra);
+  }
+  const sum = values.reduce((accumulator, current) => accumulator + current, 0);
+  return sum + tags.filter((tag) => tag.length > 0).length;
+}
+`,
+    );
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, { minConfidence: "medium", limit: 5 });
+    const match = result.suggestions.find(
+      (suggestion) => suggestion.left.file === "src/accounts.ts" || suggestion.right.file === "src/accounts.ts",
+    );
+    // The two functions differ only in identifier names; toContain reports the actual clone type on failure.
+    expect(match).toBeDefined();
+    expect(["renamed", "near"]).toContain(match?.cloneType);
+    expect(match?.metrics.tokenJaccard).toBeGreaterThan(0.6);
+  });
+
+  test("does not report matching signatures as high-confidence symbol clones", async () => {
+    const root = await makeTempProject();
+    // Both files export sharedName(input: string): string, but the two bodies do unrelated work.
+    await writeProjectFile(
+      root,
+      "src/a.ts",
+      `
+export function sharedName(input: string): string {
+  const reversed = input.split("").reverse();
+  const upper = reversed.join("").toUpperCase();
+  return upper.slice(0, 12);
+}
+`,
+    );
+    await writeProjectFile(
+      root,
+      "src/b.ts",
+      `
+export function sharedName(input: string): string {
+  const parsed = JSON.parse(input) as { name?: string };
+  if (typeof parsed.name === "string") {
+    return parsed.name.trim().toLowerCase();
+  }
+  return "missing";
+}
+`,
+    );
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, {
+      includeSmall: true,
+      minConfidence: "high",
+    });
+
+    expect(result.suggestions).toHaveLength(0);
+  });
+
+  test("filters small helpers unless explicitly included", async () => {
+    const root = await makeTempProject();
+    const source = `export function sameTiny(value: number) { return value + 1; }\n`;
+
+    await writeProjectFile(root, "a.ts", source);
+    await writeProjectFile(root, "b.ts", source);
+    // One index is queried twice: once with default thresholds, once with includeSmall.
+    const index = await buildProjectIndex(root);
+    const defaultResult = await findDuplicates(index, { minConfidence: "low" });
+    const includedResult = await findDuplicates(index, { includeSmall: true, minConfidence: "high" });
+
+    expect(defaultResult.suggestions).toHaveLength(0);
+    expect(defaultResult.omittedCounts.belowThresholdUnits).toBeGreaterThan(0);
+    expect(includedResult.suggestions.length).toBeGreaterThan(0);
+  });
+
+  test("rejects invalid token bounds", async () => {
+    const root = await makeTempProject();
+
+    await writeProjectFile(root, "src/a.ts", `export function a() { return 1; }\n`);
+    // minTokens (20) is above maxTokens (10), i.e. an inverted range.
+    const index = await buildProjectIndex(root);
+    await expect(findDuplicates(index, { minTokens: 20, maxTokens: 10 })).rejects.toThrow(
+      "Expected a value greater than or equal to minTokens",
+    );
+  });
+
+  test("rejects invalid numeric options", async () => {
+    const root = await makeTempProject();
+
+    await writeProjectFile(root, "src/a.ts", `export function a() { return 1; }\n`);
+    // Each expected message quotes the offending value and names the accepted range.
+    const index = await buildProjectIndex(root);
+    await expect(findDuplicates(index, { limit: -1 })).rejects.toThrow(
+      'Invalid limit value "-1". Expected a non-negative integer.',
+    );
+    await expect(findDuplicates(index, { minTokens: 0 })).rejects.toThrow(
+      'Invalid minTokens value "0". Expected a positive integer.',
+    );
+    await expect(findDuplicates(index, { shingleSize: Number.NaN })).rejects.toThrow(
+      'Invalid shingleSize value "NaN". Expected a positive integer.',
+    );
+  });
+
+  test("keeps cross-language exact text in separate candidate buckets", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function sharedClone(rows) {
+  const output = [];
+  for (const row of rows) {
+    const subtotal = row.amount + row.fee;
+    const rounded = Math.round(subtotal * 100) / 100;
+    const label = rounded > 100 ? "large" : "small";
+    output.push(label + ":" + rounded.toFixed(2));
+  }
+  return output.filter((value) => value.includes(":")).join(",");
+}
+`;
+    // Identical text is written once with a .ts extension and once with a .js extension.
+    await writeProjectFile(root, "src/a.ts", source);
+    await writeProjectFile(root, "src/b.js", source);
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, { includeSmall: true, minConfidence: "high" });
+
+    expect(result.suggestions).toHaveLength(0);
+  });
+
+  test("duplicates CLI detects duplicate JSON text files", async () => {
+    const root = await makeTempProject();
+    const source = JSON.stringify({
+      workflows: [
+        { name: "build", command: "npm run build", retries: 2 },
+        { name: "test", command: "npm run test:ci", retries: 1 },
+        { name: "lint", command: "npm run lint", retries: 1 },
+      ],
+      env: {
+        CI: true,
+        NODE_OPTIONS: "--max-old-space-size=4096",
+      },
+    });
+
+    await writeProjectFile(root, "configs/a.json", source);
+    await writeProjectFile(root, "configs/b.json", source);
+    // The CLI runs with its cwd set to the temp project and "configs" as the positional path.
+    const result = await captureCli(
+      ["duplicates", "--root", ".", "configs", "--min-confidence", "high", "--limit", "1"],
+      root,
+    );
+    const parsed = JSON.parse(result.stdout) as {
+      suggestions?: Array<{ left?: { file?: string; tokenCount?: number }; right?: { file?: string } }>;
+    };
+
+    expect(result.exitCode).toBeUndefined();
+    expect(result.stderr).toBe("");
+    expect(parsed.suggestions).toHaveLength(1);
+    expect(parsed.suggestions?.[0]?.left?.file).toBe("configs/a.json");
+    expect(parsed.suggestions?.[0]?.right?.file).toBe("configs/b.json");
+    expect(parsed.suggestions?.[0]?.left?.tokenCount).toBeGreaterThan(40);
+  });
+
+  test("accepts project-relative file filters", async () => {
+    const root = await makeTempProject();
+    const source = `export function sameTiny(value: number) { return value + 1; }\n`;
+
+    await writeProjectFile(root, "src/a.ts", source);
+    await writeProjectFile(root, "src/b.ts", source);
+    // The files filter below uses paths relative to projectRoot rather than absolute paths.
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, {
+      projectRoot: root,
+      files: ["src/a.ts", "src/b.ts"],
+      includeSmall: true,
+      minConfidence: "high",
+    });
+
+    expect(result.suggestions.length).toBeGreaterThan(0);
+    expect(result.suggestions[0]?.left.file).toBe("src/a.ts");
+  });
+
+  test("rejects duplicate file filters outside the project root", async () => {
+    const root = await makeTempProject();
+    const outsideFile = path.join(path.dirname(root), "outside.ts").replace(/\\/g, "/");
+    // outsideFile points into the temp project's parent directory and is never created on disk.
+    await writeProjectFile(root, "src/a.ts", `export function a() { return 1; }\n`);
+
+    const index = await buildProjectIndex(root);
+    await expect(
+      findDuplicates(index, {
+        projectRoot: root,
+        files: [outsideFile],
+      }),
+    ).rejects.toThrow("Duplicate input file is outside project root");
+  });
+
+  test("includes same-file non-overlapping clones only when requested", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function firstClone(rows: number[]) {
+  const output: number[] = [];
+  for (const row of rows) {
+    const doubled = row * 2;
+    const adjusted = doubled + 3;
+    output.push(adjusted);
+  }
+  return output.filter((value) => value > 10).join(",");
+}
+
+export function secondClone(rows: number[]) {
+  const output: number[] = [];
+  for (const row of rows) {
+    const doubled = row * 2;
+    const adjusted = doubled + 3;
+    output.push(adjusted);
+  }
+  return output.filter((value) => value > 10).join(",");
+}
+`;
+
+    await writeProjectFile(root, "src/local.ts", source);
+    // Both clones live in src/local.ts, so only the includeSameFile run is expected to report them.
+    const index = await buildProjectIndex(root);
+    const defaultResult = await findDuplicates(index, { includeSmall: true, minConfidence: "medium" });
+    const sameFileResult = await findDuplicates(index, {
+      includeSmall: true,
+      includeSameFile: true,
+      minConfidence: "medium",
+    });
+
+    expect(defaultResult.suggestions).toHaveLength(0);
+    expect(sameFileResult.suggestions.length).toBeGreaterThan(0);
+    expect(sameFileResult.suggestions[0]?.left.file).toBe("src/local.ts");
+    expect(sameFileResult.suggestions[0]?.right.file).toBe("src/local.ts");
+  });
+
+  test("duplicates CLI emits bounded JSON suggestions", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function summarizeOrders(rows: Array<{ amount: number; tax: number }>) {
+  const output: string[] = [];
+  for (const row of rows) {
+    const subtotal = row.amount + row.tax;
+    const rounded = Math.round(subtotal * 100) / 100;
+    const label = rounded > 100 ? "large" : "small";
+    output.push(label + ":" + rounded.toFixed(2));
+  }
+  return output.filter((value) => value.includes(":")).join(",");
+}
+`;
+
+    await writeProjectFile(root, "src/orders-a.ts", source);
+    await writeProjectFile(root, "src/orders-b.ts", source);
+    // No --root flag is passed here; the CLI only gets the injected cwd and the "src" path.
+    const result = await captureCli(["duplicates", "src", "--min-confidence", "high", "--limit", "1"], root);
+    const parsed = JSON.parse(result.stdout) as { suggestions?: Array<{ score?: number }> };
+
+    expect(result.exitCode).toBeUndefined();
+    expect(result.stderr).toBe("");
+    expect(parsed.suggestions).toHaveLength(1);
+    expect(parsed.suggestions?.[0]?.score).toBeGreaterThan(90);
+  });
+
+  test("duplicates CLI accepts a zero suggestion limit", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function sameRows(rows: number[]) {
+  const output: number[] = [];
+  for (const row of rows) {
+    output.push(row * 2 + 1);
+  }
+  return output.join(",");
+}
+`;
+
+    await writeProjectFile(root, "src/a.ts", source);
+    await writeProjectFile(root, "src/b.ts", source);
+
+    const result = await captureCli(["duplicates", "--root", ".", "src", "--limit", "0", "--include-small"], root);
+    const parsed = JSON.parse(result.stdout) as {
+      suggestions?: unknown[];
+      omittedCounts?: { suggestions?: number; candidatePairs?: number };
+      stats?: { candidatePairs?: number };
+    };
+    // With --limit 0 nothing is listed, yet the pair must still appear in the omitted count and in stats.
+    expect(result.exitCode).toBeUndefined();
+    expect(result.stderr).toBe("");
+    expect(parsed.suggestions).toHaveLength(0);
+    expect(parsed.omittedCounts?.suggestions).toBeGreaterThan(0);
+    expect(parsed.omittedCounts?.candidatePairs).toBeUndefined();
+    expect(parsed.stats?.candidatePairs).toBeGreaterThan(0);
+  });
+
+  test("counts only considered fingerprints when oversized buckets are skipped", async () => {
+    const root = await makeTempProject();
+    const punctuationBlock = (characters: string, lines: number): string =>
+      Array.from({ length: lines }, (_, line) => {
+        let value = "";
+        for (let offset = 0; offset < 5; offset++) {
+          value += characters[(line * 7 + offset * 3) % characters.length];
+        }
+        return value;
+      }).join("\n");
+    const commonOversizedBlock = punctuationBlock("(){}[]<>+-*/%=!?:;,.|&^~", 40);
+    const sharedEligibleBlock = punctuationBlock("@#$\\_", 8);
+    const leftUniqueBlock = punctuationBlock("(){}[]<>+-*/%=!?:;,.|&^~", 80);
+    const rightUniqueBlock = punctuationBlock("~~~~^^^^||||&&&&!!!!????::::;;;;,,,,....", 80);
+    const thirdUniqueBlock = punctuationBlock("<<<<>>>>====++++----****////%%%%", 10);
+    // All three text files start with commonOversizedBlock; only a.txt and b.txt also contain sharedEligibleBlock.
+    await writeProjectFile(root, "src/index.ts", "export const marker = 1;\n");
+    await writeProjectFile(root, "src/a.txt", `${commonOversizedBlock}\n${sharedEligibleBlock}\n${leftUniqueBlock}\n+`);
+    await writeProjectFile(root, "src/b.txt", `${commonOversizedBlock}\n${sharedEligibleBlock}\n${rightUniqueBlock}\n-`);
+    await writeProjectFile(root, "src/c.txt", `${commonOversizedBlock}\n${thirdUniqueBlock}\n#`);
+
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, {
+      projectRoot: root,
+      files: ["src/a.txt", "src/b.txt", "src/c.txt"],
+      includeSmall: true,
+      maxBucketSize: 2,
+      minConfidence: "low",
+    });
+    // Some bucket must be dropped as oversized, while the a.txt/b.txt pair must still be suggested.
+    expect(result.omittedCounts.oversizedBuckets).toBeGreaterThan(0);
+    expect(
+      result.suggestions.some(
+        (suggestion) => suggestion.left.file === "src/a.txt" && suggestion.right.file === "src/b.txt",
+      ),
+    ).toBeTruthy();
+  });
+
+  test("skips oversized candidate buckets", async () => {
+    const root = await makeTempProject();
+    const source = `
+export function sharedOversizedClone(rows) {
+  const output = [];
+  for (const row of rows) {
+    const subtotal = row.amount + row.fee;
+    const rounded = Math.round(subtotal * 100) / 100;
+    const label = rounded > 100 ? "large" : "small";
+    output.push(label + ":" + rounded.toFixed(2));
+  }
+  return output.filter((value) => value.includes(":")).join(",");
+}
+`;
+
+    await writeProjectFile(root, "src/a.ts", source);
+    await writeProjectFile(root, "src/b.ts", source);
+    await writeProjectFile(root, "src/c.ts", source);
+    // Three identical files are indexed and then queried with maxBucketSize: 2.
+    const index = await buildProjectIndex(root);
+    const result = await findDuplicates(index, {
+      includeSmall: true,
+      maxBucketSize: 2,
+      minConfidence: "high",
+    });
+
+    expect(result.suggestions).toHaveLength(0);
+    expect(result.omittedCounts.oversizedBuckets).toBeGreaterThan(0);
+  });
+});