diff --git a/docs/ai-and-plans/shell-syntax-highlighting/0-plan.md b/docs/ai-and-plans/shell-syntax-highlighting/0-plan.md new file mode 100644 index 000000000..8507a4984 --- /dev/null +++ b/docs/ai-and-plans/shell-syntax-highlighting/0-plan.md @@ -0,0 +1,639 @@ +# Interactive Shell — Input Syntax Highlighting + +## Goal + +Add real-time syntax highlighting to the interactive shell's input line. As the user types a command like `db.users.find({ age: { $gt: 25 } })`, the terminal renders keywords, strings, numbers, operators, BSON constructors, and `$`-prefixed query operators in distinct colors — exactly like the output formatter already colorizes result values. + +## Approach: Monarch Tokenizer Extraction + +Reuse the same JavaScript Monarch tokenizer rules that the query editors already use (via `registerDocumentDBQueryLanguage` in `src/webviews/query-language-support/registerLanguage.ts`), but run them in the extension host (Node.js) instead of in a Monaco Editor. This requires: + +1. **A vendored copy of the Monarch tokenizer rules** — the JS keyword lists, regex patterns, and state machine transitions extracted from `monaco-editor/esm/vs/basic-languages/typescript/typescript.js` + `javascript/javascript.js`. Extended with DocumentDB-specific token categories. +2. **A lightweight Monarch state-machine executor** — runs the tokenizer rules against a plain string and returns token spans with their types, without any Monaco or DOM dependency. +3. **A token-to-ANSI mapper** — converts Monarch token types (`keyword`, `string`, `number`, `comment`, `identifier`, etc.) plus custom DocumentDB types to ANSI 16-color escape sequences. +4. **A line re-rendering function in `ShellInputHandler`** — replaces the current per-character echo approach; on every buffer mutation, rewrites the full line with ANSI colors and repositions the cursor. + +### Why This Approach + +- **Zero new dependencies** — no `emphasize`, `lowlight`, or `highlight.js` added to the bundle. 
+- **Proven edge-case handling** — the Monarch JS tokenizer correctly handles regex literal disambiguation, template literal interpolation, numeric formats (hex/octal/binary), and escape sequences. The extension's query editors already rely on these exact rules. +- **Native DocumentDB awareness** — BSON constructors, `$`-prefixed operators, and shell commands are added as first-class token categories in the rule set, not bolted on via post-processing. +- **Architectural consistency** — the query editor surface and the shell surface share the same token classification rules, producing visually consistent highlighting. + +--- + +## File Plan + +All new files go under `src/documentdb/shell/highlighting/`: + +``` +src/documentdb/shell/highlighting/ +├── monarchRules.ts # WI-1: Vendored + extended tokenizer rules +├── monarchRunner.ts # WI-2: State-machine executor +├── tokenColorizer.ts # WI-3: Token-to-ANSI mapper +├── monarchRunner.test.ts # WI-2 tests +├── tokenColorizer.test.ts # WI-3 tests +└── shellHighlighter.test.ts # WI-4 integrated tests +``` + +Modified files: + +- `src/documentdb/shell/ShellInputHandler.ts` — WI-4: re-rendering infrastructure +- `src/documentdb/shell/ShellInputHandler.test.ts` — WI-4: updated tests + +--- + +## Work Items + +### WI-1: Vendored Monarch Rules (`monarchRules.ts`) — ✅ DONE + +**Goal:** Create a standalone copy of the JavaScript Monarch tokenizer rules, extended with DocumentDB-specific token categories. No runtime dependency on `monaco-editor`. 
+ +**Source material:** The tokenizer rules live in two files in `node_modules/monaco-editor/esm/vs/basic-languages/`: + +- `typescript/typescript.js` — contains the actual `tokenizer` state machine (states: `root`, `common`, `whitespace`, `comment`, `jsdoc`, `regexp`, `regexrange`, `string_double`, `string_single`, `string_backtick`, `bracketCounting`), plus named regex patterns (`symbols`, `escapes`, `digits`, `octaldigits`, `binarydigits`, `hexdigits`, `regexpctl`, `regexpesc`), and the `operators` list. +- `javascript/javascript.js` — overrides `keywords` (removes TypeScript-only keywords like `interface`, `enum`, `declare`, etc.) and `typeKeywords` (empty array). Everything else delegates to the TypeScript rules. + +**What to extract:** The combined JavaScript variant of the rules — the JS `keywords` list + the TS tokenizer/patterns/operators. This produces the self-contained data structure the executor needs. + +**DocumentDB extensions — add these custom token categories:** + +1. **BSON constructors** — A `bsonConstructors` string array: + + ``` + ObjectId, ISODate, NumberLong, NumberInt, NumberDecimal, + BinData, UUID, Timestamp, MinKey, MaxKey + ``` + + These names come from `packages/documentdb-constants/src/bsonConstructors.ts`. + In the `common` state, the existing `[/[A-Z][\w\$]*/, "type.identifier"]` rule already matches these PascalCase names. Add a `cases` branch that checks `@bsonConstructors` and emits `"bson.constructor"` instead of `"type.identifier"`. + +2. **Shell commands** — A `shellCommands` string array: `["show", "use", "it", "exit", "quit", "cls", "clear", "help"]`. Add a `cases` branch in the lowercase identifier rule (`/#?[a-z_$][\w$]*/`) so `@shellCommands -> "shell.command"` is checked before `@keywords`. + +3. **`$`-prefixed operators** — Add a new rule before the general identifier rule: + ``` + [/\$[a-zA-Z_]\w*/, "documentdb.operator"] + ``` + This matches `$gt`, `$match`, `$lookup`, etc. 
It fires before the general identifier rule because Monarch rules are matched in order. + +**Output shape:** + +```typescript +export interface MonarchLanguageRules { + keywords: string[]; + bsonConstructors: string[]; + shellCommands: string[]; + operators: string[]; + symbols: RegExp; + escapes: RegExp; + digits: RegExp; + octaldigits: RegExp; + binarydigits: RegExp; + hexdigits: RegExp; + regexpctl: RegExp; + regexpesc: RegExp; + tokenizer: Record<string, MonarchRule[]>; +} +``` + +Each `MonarchRule` is one of: + +- `[RegExp, string]` — match regex, emit token type +- `[RegExp, { cases: Record<string, string> }]` — match regex, emit based on case lookup +- `[RegExp, string, string]` — match regex, emit token, push state +- `[RegExp, { token: string, next: string }]` — match regex, emit token, push/pop state +- `[RegExp, string[]]` — match regex, emit array of tokens (one per capture group) +- `{ include: string }` — include another state's rules + +> **DEVIATION (WI-1 — Rule Types):** The `MonarchRule` type uses **named properties** (`{ regex, action }`, `{ regex, actionCases }`, `{ regex, actionByGroup }`, `{ include }`) instead of positional tuples (`[RegExp, string]`). This makes the executor's pattern-matching simpler and produces self-documenting code. +> +> **Alternatives analyzed:** +> +> 1. **Tuple arrays (as planned):** Pro: closer to Monaco's internal format. Con: requires index-based discrimination (`rule.length === 2` vs `3`) which is brittle. Con: `[RegExp, string | { cases } | string[]]` union is hard to narrow. +> 2. **Named properties (chosen):** Pro: explicit `'actionCases' in rule` checks. Pro: easier to read and maintain. Con: slightly more verbose than tuple literals. +> 3. **Tagged union with `kind` discriminant:** Pro: perfect type narrowing. Con: over-engineering; `'field' in obj` checks work fine for 4 variants. + +**Important:** All regex patterns from the Monaco source use `@name` references (e.g., `@digits`, `@escapes`). 
The executor (WI-2) must resolve these at init time by replacing `@name` in the pattern source with the corresponding regex source string before compiling. + +> **DEVIATION (WI-1):** Instead of keeping `@name` references in regex source strings and resolving them in the executor, all regex patterns were **inlined directly** in `monarchRules.ts`. For example, `/(@digits)[eE]([\-+]?(@digits))?/` became `/(\d+(_+\d+)*)[eE]([\-+]?(\d+(_+\d+)*))?/`. Patterns like `escapes`, `regexpctl`, `regexpesc` are used as standalone `RegExp` objects directly in string/regexp state rules. +> +> **Reasoning:** Eliminates the need for regex source string manipulation in the executor — the most error-prone step. +> +> **Alternatives analyzed:** +> +> 1. **Keep `@name` references (as planned):** +> - Pro: Faithful to Monaco Monarch format; easier to diff against upstream. +> - Pro: Single source of truth for named patterns. +> - Con: Requires non-trivial regex-source-string replacement at init time (string→RegExp→string round-trip is fragile). +> 2. **Inline patterns (chosen):** +> - Pro: Simpler executor — no resolution step, fewer moving parts. +> - Pro: Direct `RegExp` objects avoid regex compilation bugs from malformed source splicing. +> - Con: Patterns are duplicated (but they're constants that never change). +> 3. **Build a pre-compilation step that resolves at build time:** +> - Pro: Best of both worlds — faithful source and no runtime cost. +> - Con: Adds a build-time dependency and makes the code harder to understand. +> - Con: Over-engineering for a set of fixed patterns. + +**Licensing:** The Monaco Editor source is MIT-licensed. Include the Monaco license header in the file comment. + +--- + +### WI-2: Monarch State-Machine Executor (`monarchRunner.ts`) — ✅ DONE + +**Goal:** A function that takes a string and the `MonarchLanguageRules`, runs the tokenizer state machine, and returns an array of `(startOffset, endOffset, tokenType)` spans. 
+ +**API:** + +```typescript +export interface TokenSpan { + start: number; + end: number; + type: string; // e.g., "keyword", "string", "number", "bson.constructor", etc. +} + +export function tokenize(input: string, rules: MonarchLanguageRules): TokenSpan[]; +``` + +**How Monarch tokenizer rules work (simplified for our needs):** + +The tokenizer is a set of named **states** (e.g., `"root"`, `"common"`, `"string_double"`). Each state is an ordered array of rules. Processing starts in state `"root"`. + +At each position in the input: + +1. Try each rule in the current state, in order. +2. If a rule's regex matches at the current position (anchored via `lastIndex`), consume the matched text and emit the token type. +3. If the rule has a `next` action: + - `"@pop"` — pop the state stack (return to parent state). + - `"@stateName"` — push `stateName` onto the state stack and transition. + - Just a state name string — same as `@stateName`. +4. If the rule is `{ include: "@stateName" }`, splice that state's rules into the current position (or just recurse). +5. If no rule matches, consume one character with the `defaultToken` type (`"invalid"`) and stay in the current state. This prevents infinite loops. + +**`@name` regex resolution:** + +Before running, preprocess all regex patterns in the tokenizer rules. For any `@name` reference in a regex source (e.g., `(@digits)` in `/(@digits)[eE]/`), replace `@name` with the source of the corresponding named pattern from the rules object. Then compile the final regex. Cache the compiled regexes — they don't change between calls. + +**`cases` resolution:** + +When a rule has `cases: { "@keywords": "keyword", "@default": "identifier" }`, the executor must: + +1. Look up the matched text in the array named by the `@`-prefixed key (e.g., check if the matched text is in `rules.keywords`). +2. If found, use that token type. +3. Otherwise, use `@default`. + +**State stack:** Use a simple array. 
Max depth: 32 (guard against infinite recursion from malformed rules or adversarial input). + +**Performance requirements:** + +- The function is called on every keystroke for the current line buffer (typically 1–200 characters). +- Target: < 0.5ms for a 200-character line. The Monarch rules are pre-compiled regexes, so this is straightforward — no allocation-heavy parsing. +- Memoize the previous (input, result) pair. If the input hasn't changed (cursor-only movements), return the cached result. + +**Edge cases to handle:** + +- Empty input → return empty array. +- Input that is entirely inside a string or comment (e.g., an unterminated `"hello`) → the tokenizer state stack will be non-empty at the end, but that is expected and correct. +- The executor does NOT need to persist state across lines. Each call is stateless (the shell is a single-line input; multi-line mode is accumulated in `_multiLineBuffer` and each line is highlighted independently). + +**Tests (`monarchRunner.test.ts`):** + +Write tests for these categories, using a `describe('MonarchRunner', ...)` block: + +| Category | Example Input | Expected Tokens | +| ------------------------ | ---------------------------------- | ------------------------------------------------------------------------------------- | +| Keywords | `const x = 1` | `keyword("const"), identifier("x"), delimiter("="), number("1")` | +| Strings | `"hello world"` | `string('"hello world"')` | +| Single-quoted strings | `'hello'` | `string("'hello'")` | +| Template literals | `` `hello ${name}` `` | `string, delimiter.bracket, identifier, delimiter.bracket, string` | +| Numbers (int) | `42` | `number("42")` | +| Numbers (float) | `3.14` | `number.float("3.14")` | +| Numbers (hex) | `0xFF` | `number.hex("0xFF")` | +| Comments (line) | `// a comment` | `comment("// a comment")` | +| Comments (block) | `/* block */` | `comment("/* block */")` | +| Regex literals | `/^hello/i` | `regexp("/^hello/"), keyword.other("i")` | +| BSON 
constructors | `ObjectId("abc")` | `bson.constructor("ObjectId"), ...` | +| DocumentDB API operators | `{ $gt: 5 }` | `delimiter.bracket, documentdb.operator("$gt"), delimiter, number, delimiter.bracket` | +| Shell commands | `show dbs` | `shell.command("show"), identifier("dbs")` | +| Mixed | `db.users.find({ name: "alice" })` | Each token classified correctly | +| Empty input | `""` | `[]` | + +--- + +### WI-3: Token-to-ANSI Mapper (`tokenColorizer.ts`) — ✅ DONE + +**Goal:** Convert an array of `TokenSpan` and the original input string into an ANSI-colorized string suitable for writing to the terminal via `Pseudoterminal.onDidWrite`. + +**API:** + +```typescript +export function colorizeInput(input: string, tokens: TokenSpan[]): string; +``` + +**Color mapping:** + +Use the same ANSI 16-color palette already established by `ShellOutputFormatter` (see `src/documentdb/shell/ShellOutputFormatter.ts`). These colors respect the user's terminal theme via VS Code's `terminal.ansi*` theme colors. 
+ +| Token Type | ANSI Code | Color | Rationale | +| --------------------------------------------------------------------------- | ---------- | ------- | --------------------------------------- | +| `keyword` | `\x1b[36m` | Cyan | Matches JS keyword convention | +| `string` | `\x1b[32m` | Green | Matches output formatter's string color | +| `string.escape` | `\x1b[33m` | Yellow | Escape sequences stand out | +| `string.invalid` | `\x1b[31m` | Red | Unterminated strings | +| `number` / `number.float` / `number.hex` / `number.octal` / `number.binary` | `\x1b[33m` | Yellow | Matches output formatter's number color | +| `comment` / `comment.doc` | `\x1b[90m` | Gray | Subdued | +| `regexp` | `\x1b[31m` | Red | Distinct from strings | +| `bson.constructor` | `\x1b[36m` | Cyan | Highlighted as built-in constructors | +| `documentdb.operator` | `\x1b[33m` | Yellow | Stand out within query objects | +| `shell.command` | `\x1b[35m` | Magenta | Visually distinct from JS keywords | +| `type.identifier` | (no color) | Default | PascalCase identifiers (non-BSON) | +| `identifier` | (no color) | Default | Regular identifiers | +| `delimiter` / `delimiter.bracket` | (no color) | Default | Punctuation | +| (all others / `invalid`) | (no color) | Default | Don't colorize unknown tokens | + +The function builds the output string by iterating tokens in order: + +1. For each token, if the token type has a color, emit `{colorCode}{text}\x1b[0m`. +2. If the token type has no color, emit the raw text. +3. If there are gaps between tokens (shouldn't happen with a correct tokenizer, but guard defensively), emit the gap text uncolored. + +**ANSI reset:** Every colored span must be followed by `\x1b[0m` (reset) so colors don't bleed into adjacent tokens. This is cheap (4 bytes per colored token) and prevents visual corruption. 
+ +**Tests (`tokenColorizer.test.ts`):** + +| Test | Input | Assertion | +| ----------------------------------- | ------------------------------ | ----------------------------------------- | +| Keywords get cyan | `const` as keyword token | Output contains `\x1b[36mconst\x1b[0m` | +| Strings get green | `"hello"` as string token | Output contains `\x1b[32m"hello"\x1b[0m` | +| Numbers get yellow | `42` as number token | Output contains `\x1b[33m42\x1b[0m` | +| Identifiers uncolored | `foo` as identifier token | Output is `foo` (no ANSI) | +| BSON constructors get cyan | `ObjectId` as bson.constructor | Output contains `\x1b[36mObjectId\x1b[0m` | +| Shell commands get magenta | `show` as shell.command | Output contains `\x1b[35mshow\x1b[0m` | +| DocumentDB API operators get yellow | `$gt` as documentdb.operator | Output contains `\x1b[33m$gt\x1b[0m` | +| Empty input | `""` | Output is `""` | +| Full line integration | `db.users.find({ $gt: 1 })` | Correct colors for each token | + +--- + +### WI-4: ShellInputHandler Re-Rendering (`ShellInputHandler.ts` modifications) — ✅ DONE + +**Goal:** Replace the current per-character echo approach with full-line re-rendering so syntax highlighting applies on every buffer mutation. + +#### 4a. Add highlighting dependency + +Add an optional `colorize` callback to `ShellInputHandlerCallbacks`: + +```typescript +export interface ShellInputHandlerCallbacks { + write: (data: string) => void; + onLine: (line: string) => void; + onInterrupt: () => void; + onContinuation: () => void; + /** Optional: colorize the input buffer for syntax highlighting. */ + colorize?: (input: string) => string; +} +``` + +When `colorize` is not provided (or returns the input unchanged), behavior is identical to today — no highlighting. This preserves backward compatibility and makes testing easier. + +#### 4b. Add `reRenderLine()` method + +This is the core rendering function. 
It replaces the current approach where each editing method (`insertCharacter`, `handleBackspace`, `clearBeforeCursor`, `deleteWordBeforeCursor`, `handleDelete`, `clearAfterCursor`, `replaceLineWith`) independently writes ANSI sequences to echo its specific change. + +``` +reRenderLine(): + 1. Move cursor to column 0 of the input area (column = prompt width). + → write `\r` + `\x1b[{promptWidth}C` (or just `\r` + prompt-width spaces) + Actually, since we don't re-render the prompt, the simplest approach is: + → Move cursor left by `_cursor` positions to reach input start: `\x1b[{_cursor}D` + But _cursor may be 0 (e.g., after clearBeforeCursor). Instead: + → Use `\r` (carriage return to column 0) + `\x1b[{promptWidth}C` (move right past prompt). + 2. Write the colorized buffer: + → if `colorize` callback exists: `colorize(_buffer)` + → else: `_buffer` + 3. Erase any leftover characters from the previous (longer) buffer: + → write `\x1b[K` (erase from cursor to end of line) + 4. Reposition cursor to the correct position: + → The colorized string contains ANSI escape codes (zero-width), so the + visual cursor is now at `_buffer.length` (the end of the input text). + → Move it back by `(_buffer.length - _cursor)` positions if cursor is + not at the end. + → `\x1b[{_buffer.length - _cursor}D` +``` + +**`_promptWidth` field:** Add a `_promptWidth: number` field to `ShellInputHandler`. The existing `setPromptWidth(width)` method (currently a no-op reserved for future use) stores this value. `DocumentDBShellPty` must call `setPromptWidth()` with the visual width of the prompt string (e.g., `"mydb> ".length`) before showing each prompt. + +#### 4c. 
Rewire all buffer mutation methods + +Replace the manual ANSI echo logic in each method with a call to `reRenderLine()`: + +| Method | Current behavior | New behavior | +| -------------------------- | -------------------------------------------------- | ----------------------------------------------------- | +| `insertCharacter(ch)` | Writes `ch + after + backspaces` | Update `_buffer` and `_cursor`, then `reRenderLine()` | +| `handleBackspace()` | Writes `\b + after + space + backspaces` | Update `_buffer` and `_cursor`, then `reRenderLine()` | +| `handleDelete()` | Writes `after + space + backspaces` | Update `_buffer` and `_cursor`, then `reRenderLine()` | +| `clearBeforeCursor()` | Writes cursor-left + after + spaces + backspaces | Update `_buffer` and `_cursor`, then `reRenderLine()` | +| `clearAfterCursor()` | Writes `ERASE_TO_EOL` | Update `_buffer`, then `reRenderLine()` | +| `deleteWordBeforeCursor()` | Writes cursor-left + after + spaces + backspaces | Update `_buffer` and `_cursor`, then `reRenderLine()` | +| `replaceLineWith(newText)` | Writes cursor-left + newText + spaces + backspaces | Update `_buffer` and `_cursor`, then `reRenderLine()` | + +Each method becomes simpler: just do the buffer/cursor mutation logic, then call `reRenderLine()`. All ANSI complexity is centralized in one place. + +**Cursor-only movement (`moveCursorLeft`, `moveCursorRight`, `moveCursorTo`, `wordLeft`, `wordRight`):** These do NOT call `reRenderLine()`. They update `_cursor` and write cursor-movement ANSI sequences directly, as they do today. No buffer content changes → no re-tokenization needed. + +#### 4d. 
Wire up in `DocumentDBShellPty` + +In `DocumentDBShellPty`'s constructor (where the `ShellInputHandler` is created), provide the `colorize` callback: + +```typescript +import { tokenize } from './highlighting/monarchRunner'; +import { colorizeInput } from './highlighting/tokenColorizer'; +import { shellLanguageRules } from './highlighting/monarchRules'; + +// In constructor: +this._inputHandler = new ShellInputHandler({ + write: (data: string) => this._writeEmitter.fire(data), + onLine: (line: string) => void this.handleLineInput(line), + onInterrupt: () => this.handleInterrupt(), + onContinuation: () => this.showContinuationPrompt(), + colorize: (input: string) => { + if (!this.isColorEnabled()) { + return input; + } + const tokens = tokenize(input, shellLanguageRules); + return colorizeInput(input, tokens); + }, +}); +``` + +Also wire up `setPromptWidth()`: + +In the `showPrompt()` method (and `showContinuationPrompt()`), after writing the prompt string, call: + +```typescript +this._inputHandler.setPromptWidth(promptString.length); +``` + +The prompt string is currently something like `"mydb> "` — its visual width (without ANSI codes) must be measured. + +#### 4e. Update existing tests + +The `ShellInputHandler.test.ts` tests assert on exact `write()` output. With the re-rendering approach, the output format changes from incremental character echoes to full-line rewrites. + +**Strategy:** Add the `colorize` callback in a subset of tests to verify highlighting works. For existing tests that don't care about highlighting, pass no `colorize` callback — they should continue passing with updated output assertions that match the `reRenderLine()` sequence instead of the old per-character echo. 
+ +**New integrated test file (`shellHighlighter.test.ts`):** + +End-to-end tests that create a `ShellInputHandler` with the real highlighting pipeline and verify that typing sequences produce correctly colorized output: + +| Test | Action | Assertion | +| ------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------- | +| Typing a keyword | Type `c`, `o`, `n`, `s`, `t` | After each keystroke, the re-rendered line shows the partial input. After `t`, "const" is highlighted as a keyword (cyan). | +| Typing a string | Type `"`, `h`, `i`, `"` | After `"`, the `"` is in string color. After `"hi"` is complete, the entire string is green. | +| Typing a BSON constructor | Type `O`, `b`, `j`, ... `d` | After completing "ObjectId", the word shows as a BSON constructor (cyan). | +| Backspace mid-word | Type `const`, backspace 2, type `le` | Result: `conle`, no keyword highlighting (it's not a keyword). | +| History recall | Type `db.find()`, Enter, Up arrow | The recalled line is re-rendered with highlighting. | +| Clear line (Ctrl+U) | Type `db.find()`, Ctrl+U | Line is empty, no highlighting output. | + +--- + +## Implementation Sequence + +``` +WI-1 (monarchRules.ts) + ↓ +WI-2 (monarchRunner.ts + tests) — depends on WI-1 + ↓ +WI-3 (tokenColorizer.ts + tests) — depends on WI-2 interface only + ↓ +WI-4 (ShellInputHandler changes + integration tests) — depends on WI-2 + WI-3 +``` + +WI-1 and WI-3 can be developed in parallel since WI-3 only depends on the `TokenSpan` interface, not the actual rules. 
+ +--- + +## Completion Checklist + +Before marking this feature complete: + +- [x] `npm run build` succeeds +- [x] `npm run lint` passes (1 pre-existing error in DocumentDBClusterItem.ts, unrelated) +- [x] `npm run prettier-fix` has been run +- [x] All new files have the Microsoft copyright header +- [x] All new tests pass (253 tests across 9 test suites) +- [x] Existing `ShellInputHandler.test.ts` tests pass (2 assertions updated for re-render format) +- [x] Existing `DocumentDBShellPty.test.ts` tests pass +- [x] No new `any` types +- [x] No references to product names other than "DocumentDB" and "DocumentDB API" in code, comments, and test descriptions +- [x] The `documentDB.shell.display.colorOutput` setting (already exists) gates highlighting — when `false`, the `colorize` callback returns the input unchanged +- [x] Bundle size has not increased (no new dependencies) + +--- + +## Manual Test Plan + +Open a DocumentDB Interactive Shell terminal (`DocumentDB: Open Shell`) connected to any database. The prompt should appear as `dbname> `. Verify each scenario below. + +### Prerequisites + +- The setting `documentDB.shell.display.colorOutput` is **true** (default). +- The terminal uses a theme with visible ANSI 16-color support (any VS Code default theme works). 
+ +### Color Reference + +| Color | ANSI | Applied To | +| ------- | ---- | -------------------------------------- | +| Cyan | 36 | JS keywords, BSON constructors | +| Green | 32 | Strings | +| Yellow | 33 | Numbers, `$`-operators, escape seqs | +| Magenta | 35 | Shell commands | +| Gray | 90 | Comments | +| Red | 31 | Regex literals, unterminated strings | +| Default | — | Identifiers, delimiters, brackets, `.` | + +--- + +### T-01: JS Keywords (Cyan) + +| Step | Action | Expected | +| ---- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------- | +| 1 | Type `const x = 1` | `const` is cyan; `x`, `=` are default; `1` is yellow | +| 2 | Clear line (Ctrl+U), type `if (true) { return false }` | `if`, `true`, `return`, `false` are all cyan; parens/braces are default | +| 3 | Type `let`, `var`, `function`, `new`, `this`, `null`, `undefined` (one at a time, clear between) | Each is cyan | + +### T-02: Strings — Double Quotes (Green) + +| Step | Action | Expected | +| ---- | --------------------------------------- | --------------------------------------------------------- | +| 1 | Type `"hello world"` | Entire `"hello world"` is green | +| 2 | Type `"with \"escape\""` | String content is green; `\"` escape sequences are yellow | +| 3 | Type `"unterminated` (no closing quote) | Text appears red (string.invalid) | + +### T-03: Strings — Single Quotes (Green) + +| Step | Action | Expected | +| ---- | -------------------- | ---------------------------------------------- | +| 1 | Type `'hello world'` | Entire `'hello world'` is green | +| 2 | Type `'it\'s'` | String content is green; `\'` escape is yellow | +| 3 | Type `'unterminated` | Text appears red (string.invalid) | + +### T-04: Template Literals (Green + Default for interpolation) + +| Step | Action | Expected | +| ---- | -------------------------- | 
-------------------------------------------------------------------------------------------------------- | +| 1 | Type `` `hello` `` | String parts are green | +| 2 | Type `` `hello ${name}` `` | `` `hello `` and `` ` `` are green; `${` and `}` are default (delimiter); `name` is default (identifier) | + +### T-05: Numbers (Yellow) + +| Step | Action | Expected | +| ---- | ------------- | -------- | +| 1 | Type `42` | Yellow | +| 2 | Type `3.14` | Yellow | +| 3 | Type `1e10` | Yellow | +| 4 | Type `0xFF` | Yellow | +| 5 | Type `0o77` | Yellow | +| 6 | Type `0b1010` | Yellow | + +### T-06: Comments (Gray) + +| Step | Action | Expected | +| ---- | --------------------------- | ---------------------------------------------- | +| 1 | Type `// this is a comment` | Entire line is gray | +| 2 | Type `/* block comment */` | Entire comment is gray | +| 3 | Type `x = 1 // inline` | `x`, `=` default; `1` yellow; `// inline` gray | + +### T-07: BSON Constructors (Cyan) + +| Step | Action | Expected | +| ---- | ---------------------------- | ------------------------------------------- | +| 1 | Type `ObjectId("abc123")` | `ObjectId` is cyan; `"abc123"` is green | +| 2 | Type `ISODate("2025-01-01")` | `ISODate` is cyan; string is green | +| 3 | Type `NumberLong(42)` | `NumberLong` is cyan; `42` is yellow | +| 4 | Type `NumberInt(1)` | `NumberInt` is cyan | +| 5 | Type `NumberDecimal("3.14")` | `NumberDecimal` is cyan; string is green | +| 6 | Type `UUID("abc")` | `UUID` is cyan | +| 7 | Type `Timestamp(1, 0)` | `Timestamp` is cyan; numbers are yellow | +| 8 | Type `MinKey()` | `MinKey` is cyan | +| 9 | Type `MaxKey()` | `MaxKey` is cyan | +| 10 | Type `BinData(0, "abc")` | `BinData` is cyan; `0` yellow; string green | + +### T-08: Non-BSON PascalCase (Default) + +| Step | Action | Expected | +| ---- | ------------------- | ------------------------ | +| 1 | Type `MyClass` | Default color (not cyan) | +| 2 | Type `SomeFunction` | Default color | + +### T-09: `$`-Prefixed DocumentDB 
Operators (Yellow) + +| Step | Action | Expected | +| ---- | -------------------------- | ------------------------------------------------------------------------ | +| 1 | Type `{ $gt: 5 }` | `$gt` is yellow; `5` is yellow; `{`, `}`, `:` are default | +| 2 | Type `{ $match: {} }` | `$match` is yellow | +| 3 | Type `{ $lookup: {} }` | `$lookup` is yellow | +| 4 | Type `{ $regex: /test/i }` | `$regex` is yellow; `/test/` is red (regex); `i` is cyan (keyword.other) | +| 5 | Type `$group` alone | Yellow | + +### T-10: Shell Commands (Magenta) + +| Step | Action | Expected | +| ---- | --------------- | ----------------------------------- | +| 1 | Type `show dbs` | `show` is magenta; `dbs` is default | +| 2 | Type `use mydb` | `use` is magenta; `mydb` is default | +| 3 | Type `it` | Magenta | +| 4 | Type `exit` | Magenta | +| 5 | Type `quit` | Magenta | +| 6 | Type `help` | Magenta | +| 7 | Type `cls` | Magenta | +| 8 | Type `clear` | Magenta | + +### T-11: Identifiers and Delimiters (Default / Uncolored) + +| Step | Action | Expected | +| ---- | ---------------------- | ----------------------------------------------------------------------- | +| 1 | Type `db.users.find()` | `db`, `users`, `find` are default; `.` is default; `(`, `)` are default | +| 2 | Type `x = y + z` | `x`, `y`, `z` are default; `=`, `+` are default | +| 3 | Type `[1, 2, 3]` | `[`, `]`, `,` are default; `1` `2` `3` are yellow | + +### T-12: Realistic Query Expressions + +| Step | Action | Expected | +| ---- | --------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | +| 1 | Type `db.users.find({ age: { $gt: 25 } })` | `db`, `users`, `find`, `age` default; `$gt` yellow; `25` yellow; brackets/delimiters default | +| 2 | Type `db.orders.aggregate([{ $match: { status: "active" } }])` | `$match` yellow; `"active"` green; `status` default; all brackets default | +| 3 | Type `db.test.insertOne({ name: 
"alice", age: NumberInt(30) })` | `"alice"` green; `NumberInt` cyan; `30` yellow; `name`, `age` default | +| 4 | Type `db.test.updateMany({}, { $set: { flag: true } })` | `$set` yellow; `true` cyan; rest default | + +### T-13: Editing and Re-highlighting + +| Step | Action | Expected | +| ---- | ---------------------------------------------------------------- | --------------------------------------------------------- | +| 1 | Type `const` (cyan), then Backspace twice → `con` | `con` loses cyan (not a keyword) | +| 2 | Continue typing `sole` → `console` | `console` is default (not a keyword) | +| 3 | Type `show`, then Home, type `x` → `xshow` | `xshow` is default (not a shell command) | +| 4 | Type `"hello"`, move cursor into string, insert `X` → `"helXlo"` | Entire string stays green | +| 5 | Type `$gt`, then Backspace all → empty | Prompt reappears clean with no leftover colors | +| 6 | Type `db.find()`, then Ctrl+U (clear before cursor) | Line clears, no residual colored text | +| 7 | Type `db.find()`, then Home, Ctrl+K (clear after cursor) | Only characters before cursor remain, rest erased cleanly | + +### T-14: History Recall + +| Step | Action | Expected | +| ---- | --------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------- | +| 1 | Type `const x = 1` and press Enter | Command executes | +| 2 | Press Up arrow | Recalled line `const x = 1` appears with highlighting: `const` cyan, `1` yellow | +| 3 | Press Down arrow | Line clears back to empty (no residual colors) | +| 4 | Type `db.test.find({\n age: 25\n})` as multi-line (press Enter after `{`), then complete with `})` | After submission, press Up; recalled flattened line `db.test.find({ age: 25 })` has `25` in yellow | + +### T-15: Multi-Line Continuation + +| Step | Action | Expected | +| ---- | --------------------------------------- | 
---------------------------------- | +| 1 | Type `db.test.find({` and press Enter | Continuation prompt `⋯ > ` appears | +| 2 | On the continuation line, type `$gt: 5` | `$gt` is yellow; `5` is yellow | +| 3 | Type `})` and press Enter | Expression executes | + +### T-16: Color Output Setting Gate + +| Step | Action | Expected | +| ---- | --------------------------------------------------------- | ---------------------------------------------- | +| 1 | Set `documentDB.shell.display.colorOutput` to `false` | — | +| 2 | Open a new shell terminal | — | +| 3 | Type `const x = 1` | No colors — all text is default terminal color | +| 4 | Type `show dbs` | No colors — "show" is default | +| 5 | Set `documentDB.shell.display.colorOutput` back to `true` | — | +| 6 | Open a new shell terminal | — | +| 7 | Type `const x = 1` | `const` is cyan, `1` is yellow | + +### T-17: Cursor Positioning After Re-render + +| Step | Action | Expected | +| ---- | ----------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | +| 1 | Type `abcdef`, press Home, type `X` → `Xabcdef` | Cursor is at position 1 (after `X`); remaining text not corrupted | +| 2 | Type `hello world`, press Left 5 times (cursor on `w`), type `Z` → `hello Zworld` | Cursor stays after `Z`; both words visible | +| 3 | Type `const`, press Home, press Right twice (cursor on `n`), then Delete → `cost` | `cost` is not a keyword — no cyan | +| 4 | Type a long line (~80+ chars), e.g. 
`db.mycollection.find({ name: "a very long string value here" })` | No wrapping artifacts; cursor tracks correctly at end | + +### T-18: Paste Behavior + +| Step | Action | Expected | +| ---- | ------------------------------------------------- | -------------------------------------------------------------------------------------------- | +| 1 | Paste `db.users.find({ $gt: 1 })` from clipboard | Entire line appears with correct highlighting in one render | +| 2 | Paste multi-line: `db.test.find({\n age: 25\n})` | First line triggers continuation; each continuation line is highlighted; final line executes | + +### T-19: Ctrl+C Interrupt + +| Step | Action | Expected | +| ---- | ---------------------------------------------- | ------------------------------------------------------------------ | +| 1 | Type `db.test.find({` to enter multi-line mode | Continuation prompt appears | +| 2 | Press Ctrl+C | Multi-line cancelled; new prompt appears; no residual colored text | +| 3 | Type `show` after the fresh prompt | `show` appears in magenta normally | + +### T-20: Empty and Whitespace Input + +| Step | Action | Expected | +| ---- | ---------------------------------------- | ------------------------------------------------- | +| 1 | Press Enter on empty prompt | New prompt, no errors | +| 2 | Type only spaces ` ` | No colors applied (whitespace has no token color) | +| 3 | Type ` const ` (spaces around keyword) | `const` is cyan; spaces are uncolored | diff --git a/docs/ai-and-plans/shell-syntax-highlighting/pr-review.md b/docs/ai-and-plans/shell-syntax-highlighting/pr-review.md new file mode 100644 index 000000000..5ff6b6c00 --- /dev/null +++ b/docs/ai-and-plans/shell-syntax-highlighting/pr-review.md @@ -0,0 +1,315 @@ +# PR #580 Review — Interactive Shell with Syntax Highlighting + +**PR:** https://github.com/microsoft/vscode-documentdb/pull/580 +**Reviewer:** Agent (automated review against implementation plan) +**Date:** 2026-04-15 +**Copilot Reviewer Comments:** 4 
comments submitted (merged below) + +--- + +## Overall Assessment + +The implementation faithfully follows the plan from `0-plan.md`. All 4 work items are completed, all 80 tests pass, and the architecture matches the plan's design. The deviations (named rule properties instead of tuples, inlined regex patterns, `colorizeShellInput.ts` convenience wrapper) are all improvements over the original plan and are documented in the plan itself. + +**File structure matches plan:** ✅ (with one minor location difference — see I-06) +**All plan work items implemented:** ✅ (WI-1 through WI-4) +**Tests cover plan requirements:** ✅ (all categories from plan tables) +**Color mapping matches plan spec:** ✅ (all ANSI codes correct) +**Setting gate (`colorOutput`) wired:** ✅ +**No new dependencies:** ✅ +**Copyright headers present:** ✅ +**Monaco license attribution:** ✅ + +--- + +## Issues + +### I-01: `regexpesc` regex contains spurious space — **Severity: HIGH (Bug)** + +**File:** `src/documentdb/shell/highlighting/monarchRules.ts` line 183 + +The `regexpesc` pattern contains a space between the character class and the `|` alternation: + +``` +/\\(?:[bBdDfnrstvwWn0\\\/]|[(){}\[\]\$\^|\-*+?\.] |c[A-Z]|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4})/ + ^-- spurious space +``` + +This means the second alternative matches "a special regex control character **followed by a literal space**" rather than just "a special regex control character". The correct pattern (matching the Monaco source where `@regexpctl` is inlined) should be: + +``` +/\\(?:[bBdDfnrstvwWn0\\\/]|[(){}\[\]\$\^|\-*+?\.]|c[A-Z]|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4})/ +``` + +**Impact:** Regex escape sequences like `\(` or `\[` inside regex literals will only be recognized if followed by a space. Without a trailing space, these will fall through to the `regexp.invalid` rule (`/\\./`), producing incorrect token types. This affects regex highlighting accuracy. + +**Fix:** Remove the space before `|c[A-Z]` on line 183. 
+ +--- + +### I-02: Misleading comment about `@name` resolution — **Severity: LOW (Cosmetic)** + +**File:** `src/documentdb/shell/highlighting/monarchRules.ts` lines 190–191 + +The comment says: +``` +// Inline regex references like `@digits` are resolved in the executor +// by replacing `@name` with the source of the corresponding pattern. +``` + +This contradicts the implementation — the DEVIATION note in the plan explicitly states that patterns were **inlined directly** and the executor does NOT perform `@name` resolution. This comment is leftover text that was not updated after the deviation was adopted. + +**Fix:** Update the comment to reflect reality, e.g.: +``` +// All @name references have been inlined directly in the regex patterns below. +// The executor does not need to resolve @name references. +``` + +--- + +### I-03: Module-level mutable cache in `monarchRunner.ts` — **Severity: MEDIUM (Design)** + +**File:** `src/documentdb/shell/highlighting/monarchRunner.ts` lines 29–30 + +```typescript +let cachedInput: string | undefined; +let cachedResult: TokenSpan[] | undefined; +``` + +The tokenizer cache is module-level global state. This creates: +1. **Test interference risk** — Tests sharing the same module instance may get cached results from previous test runs. The caching tests verify this intentionally, but it means test execution order matters. A test calling `tokenize('const x = 1', rules)` early would affect later tests calling the same input. +2. **Multi-shell scenario** — If two shell terminals are open, they share the same cache. Since the cache holds only a single input/result pair, this causes unnecessary re-tokenization when switching between shells (cache thrashing). This is not a correctness issue, just a performance miss. + +The plan explicitly specifies this caching approach, so it's by design. However, encapsulating the cache in a class or closure would be more robust. 
+ +**Recommendation:** Consider moving the cache into the `ShellInputHandler` or creating a `Tokenizer` class instance per shell. Not blocking for this PR. + +--- + +### I-04: `input_indexOf` uses snake_case naming — **Severity: LOW (Convention)** + +**File:** `src/documentdb/shell/highlighting/monarchRunner.ts` line 237 + +```typescript +function input_indexOf(haystack: string, needle: string, fromIndex: number): number { +``` + +TypeScript convention (and the project's ESLint config) uses camelCase for function names. This should be `inputIndexOf`. + +**Fix:** Rename to `inputIndexOf`. + +--- + +### I-05: Linear scan for keyword/operator lookup — **Severity: LOW (Performance)** + +**File:** `src/documentdb/shell/highlighting/monarchRunner.ts` line 195 + +```typescript +if (Array.isArray(array) && (array as string[]).includes(matchedText)) { +``` + +The `resolveCases` function performs `Array.includes()` which is O(n) on every identifier match. With 47 keywords, 10 BSON constructors, and 8 shell commands, this is fine for typical shell input (< 200 chars), well within the 0.5ms target. However, converting these arrays to `Set` objects (lazily, on first use) would be more idiomatic and future-proof. + +**Recommendation:** Consider converting to `Set` in a follow-up. Not blocking. + +--- + +### I-06: `shellHighlighter.test.ts` location differs from plan — **Severity: LOW (Convention)** + +**File:** `src/documentdb/shell/shellHighlighter.test.ts` + +The plan's file structure specifies: +``` +src/documentdb/shell/highlighting/ +└── shellHighlighter.test.ts # WI-4 integrated tests +``` + +But the file is actually at `src/documentdb/shell/shellHighlighter.test.ts` (one directory up, alongside `ShellInputHandler.ts`). This placement is arguably better since the integration tests exercise `ShellInputHandler` + highlighting together, but it's a deviation from the plan. + +**Recommendation:** Acceptable as-is. 
The test location makes sense given it tests integration between `ShellInputHandler` and the highlighting pipeline. + +--- + +### I-07: `colorizeShellInput.ts` not in original plan — **Severity: INFO (Deviation)** + +**File:** `src/documentdb/shell/highlighting/colorizeShellInput.ts` + +The plan specifies importing `tokenize`, `colorizeInput`, and `shellLanguageRules` directly in `DocumentDBShellPty` (WI-4d). The implementation instead creates a convenience wrapper (`colorizeShellInput`) and imports just that. This is a cleaner separation of concerns — the PTY doesn't need to know about the tokenizer/colorizer internals. + +**Assessment:** Positive deviation. No action needed. + +--- + +### I-08: `resolveCases` relies on `Object.entries` iteration order — **Severity: LOW (Correctness)** + +**File:** `src/documentdb/shell/highlighting/monarchRunner.ts` lines 186–206 + +The `resolveCases` function iterates over `Object.entries(cases)` and returns the first matching array. This means the order of keys in the `actionCases` object determines priority (e.g., `@shellCommands` before `@keywords`). In JavaScript, `Object.entries` preserves insertion order for string keys, so this works correctly. However, it's an implicit contract — reordering keys in `monarchRules.ts` would change behavior silently. + +The plan specifies this ordering requirement: "Add a `cases` branch in the lowercase identifier rule so `@shellCommands -> 'shell.command'` is checked **before** `@keywords`." The implementation correctly places `@shellCommands` first. + +**Recommendation:** Add a brief comment in the `actionCases` objects noting that key order matters. Not blocking. + +--- + +### I-09: No `l10n` strings added — **Severity: INFO (Verification)** + +No user-facing strings were added in this PR. All new code is internal (tokenizer, colorizer, ANSI sequences). The `colorize` callback and `reRenderLine()` produce terminal escape sequences, not localized messages. 
The `l10n` step from the PR checklist is satisfied (no new strings to localize). + +--- + +### I-10: Wrapped input lines are not actually supported by `reRenderLine()` — **Severity: HIGH (Bug)** + +**Files:** `src/documentdb/shell/ShellInputHandler.ts` (`reRenderLine()`), `src/documentdb/shell/DocumentDBShellPty.ts` + +The full-line re-render strategy assumes the editable input always fits on a single terminal row. `reRenderLine()` resets with `\r` and moves right by `_promptWidth`, but it never accounts for the terminal's column count or the number of wrapped rows already occupied by `prompt + buffer`. Once the user types past the terminal width, the next re-render starts at the beginning of the **current wrapped row** rather than the original prompt line. + +**Impact:** Visual corruption and cursor drift on long queries. This directly undermines manual test **T-17** in the plan (`No wrapping artifacts; cursor tracks correctly at end`) and will be visible on longer `find()` / `aggregate()` expressions. + +**Fix:** Make the renderer wrap-aware (track terminal columns and move up before rewriting), or explicitly scope the feature to single-row input until that support exists. + +--- + +### I-11: Cursor math uses `String.length` instead of terminal display width — **Severity: HIGH (Correctness)** + +**Files:** `src/documentdb/shell/ShellInputHandler.ts`, `src/documentdb/shell/DocumentDBShellPty.ts` + +Both prompt-width and cursor-offset calculations use JavaScript string length (`prompt.length`, `_buffer.length - _cursor`, etc.). That is not the same as **terminal display width** for emoji, CJK, and combining characters. The codebase already recognizes this in `ShellGhostText.ts`, which implements `terminalDisplayWidth()` for exactly this reason, but the new highlighting path does not reuse that logic. + +**Impact:** Prompts like `数据库> ` or inputs containing emoji / non-ASCII text will misplace the cursor, erase the wrong columns, or overwrite part of the prompt. 
This is a correctness bug, not just a cosmetic issue. + +**Fix:** Reuse the same display-width calculation used by `ShellGhostText` anywhere the renderer computes cursor movement or prompt width. + +--- + +### I-12: Completion-initiated edits bypass the highlighting path — **Severity: MEDIUM (UX / Requirement Gap)** + +**Files:** `src/documentdb/shell/ShellInputHandler.ts` (`insertText()`, `replaceText()`), `src/documentdb/shell/DocumentDBShellPty.ts` (`rewriteCurrentLine()`) + +The new highlighting architecture correctly re-renders after typed edits (`insertCharacter`, `handleBackspace`, `handleDelete`, etc.), but PTY-controlled buffer mutations still write raw text directly to the terminal. This affects: +- accepting ghost text, +- single-candidate Tab completion, +- replacement completions (quoted field paths / bracket notation), +- prompt rewrite after showing the completion list. + +Because these flows bypass `reRenderLine()` / `colorize`, the line can temporarily lose highlighting or show newly inserted text uncolored until the user types again. + +**Impact:** This conflicts with WI-4's stated goal that highlighting should apply on **every buffer mutation**. + +**Fix:** Route these PTY-controlled mutations through the same re-render/colorize path, or at minimum add dedicated tests and document the intended exception. + +--- + +### I-13: Key plan scenarios are still untested — **Severity: MEDIUM (Coverage Gap)** + +**Files:** `src/documentdb/shell/shellHighlighter.test.ts`, `src/documentdb/shell/ShellInputHandler.test.ts`, `src/documentdb/shell/DocumentDBShellPty.test.ts` + +The automated tests cover the happy path well, but several of the plan's most failure-prone scenarios are still missing from test coverage: +- long wrapped lines from **T-17**, +- Unicode / wide-character prompts or input, +- Tab completion and ghost-text acceptance with highlighting still intact, +- completion-list redraw preserving colorized input. 
+ +**Impact:** The existing suite can pass cleanly while these user-visible regressions remain undetected. + +**Recommendation:** Add contract-style tests for wrapped lines and completion redraw before calling the feature fully production-ready. + +--- + +## Copilot Reviewer Comments + +Copilot submitted a review with **4 comments** (generated 2026-04-15). Below each is mapped to the corresponding agent finding (or listed as new). + +### C-01: Cache ignores `rules` parameter (×2 duplicate threads) + +**Thread IDs:** `PRRT_kwDOODtcO857GrPs`, `PRRT_kwDOODtcO857GrQL` +**File:** `src/documentdb/shell/highlighting/monarchRunner.ts` +**Links:** [Thread 1](https://github.com/microsoft/vscode-documentdb/pull/580#discussion_r2494398444), [Thread 2](https://github.com/microsoft/vscode-documentdb/pull/580#discussion_r2494398476) + +> "The memoization key ignores the `rules` parameter, so calling `tokenize()` with the same input but different `rules` will incorrectly return a cached result from the previous ruleset. Fix by including rules identity in the cache (e.g., store `cachedRules` and require `rules === cachedRules` for a cache hit, or use a `WeakMap`)." + +**Mapping:** Related to **I-03** (module-level mutable cache). The agent review noted the global cache design as a concern. Copilot specifically flags the missing `rules` identity check. + +**Assessment:** In practice, `tokenize()` is only ever called with `shellLanguageRules` (the single exported instance from `monarchRules.ts`). The `colorizeShellInput.ts` wrapper hard-codes it, and no other callers exist. However, the function signature accepts `rules` as a parameter, creating an API contract that the cache violates. **Severity: LOW in practice, MEDIUM as an API correctness concern.** A simple `cachedRules` reference check (`rules === cachedRules`) is a one-line fix with no performance cost. 
+ +--- + +### C-02: `regexpesc` spurious space + +**Thread ID:** `PRRT_kwDOODtcO857GrQk` +**File:** `src/documentdb/shell/highlighting/monarchRules.ts` +**Link:** [Thread](https://github.com/microsoft/vscode-documentdb/pull/580#discussion_r2494398500) + +> "The `regexpesc` pattern contains an extra literal space after the character class (`[...\\.] |c[A-Z]...`). As written, it requires a space after that escaped character, which will cause valid regex escapes to be mis-tokenized. Remove the stray space so the alternation is `[...]|c[A-Z]|...`." + +Copilot provides a suggested fix: +```typescript +const regexpesc = /\\(?:[bBdDfnrstvwWn0\\\/]|[(){}\[\]\$\^|\-*+?\.]|c[A-Z]|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4})/; +``` + +**Mapping:** Identical to **I-01** in the agent review. Both reviews independently flagged the same bug. + +--- + +### C-03: `setPromptWidth` must be called for every prompt + +**Thread ID:** `PRRT_kwDOODtcO857GrQ6` +**File:** `src/documentdb/shell/DocumentDBShellPty.ts` +**Link:** [Thread](https://github.com/microsoft/vscode-documentdb/pull/580#discussion_r2494398522) + +> "With `ShellInputHandler` now using `\r` + `promptWidth` for every `reRenderLine()`, the PTY must ensure `setPromptWidth()` is called whenever the prompt (and continuation prompt) is written, otherwise re-renders will return to column 0 and risk overwriting the prompt area. If the prompt includes any ANSI styling, compute `promptWidth` using the visible (non-ANSI) width." + +**Mapping:** This was not explicitly flagged as an issue in the agent review, but it was **verified as correctly implemented** — `showPrompt()` calls `setPromptWidth(prompt.length)` at line 503 and `showContinuationPrompt()` calls it at line 525. Both prompts are plain text without ANSI codes, so `.length` equals the visible width as long as the prompt is ASCII-only (wide or non-ASCII database names are a separate display-width concern, tracked as I-11). No action needed for the ANSI concern — the implementation already satisfies it. + +**Assessment:** Non-issue (already handled correctly). 
Copilot's concern is valid as a general principle but the code already does the right thing. + +--- + +## Summary Table + +| ID | Severity | File | Issue | Action | +|----|----------|------|-------|--------| +| I-01 / C-02 | **HIGH** | `monarchRules.ts:183` | Spurious space in `regexpesc` regex breaks regex escape matching | ✅ Fixed (`eba9286`) | +| I-02 | LOW | `monarchRules.ts:190-191` | Misleading comment about `@name` resolution (contradicts deviation) | ✅ Fixed (`cad3ae1`) | +| I-03 / C-01 | MEDIUM | `monarchRunner.ts:29-30` | Module-level cache ignores `rules` param; global state concerns | ✅ Fixed (`af0a427`) | +| I-04 | LOW | `monarchRunner.ts:237` | `input_indexOf` uses snake_case (should be camelCase) | ✅ Fixed (`6c35419`) | +| I-05 | LOW | `monarchRunner.ts:195` | Linear array scan for keyword lookup (O(n) per match) | Deferred (not blocking) | +| I-06 | LOW | `shellHighlighter.test.ts` | Test file location differs from plan (one dir up) | Accept as-is | +| I-07 | INFO | `colorizeShellInput.ts` | Extra convenience wrapper not in plan | Accept (positive deviation) | +| I-08 | LOW | `monarchRunner.ts:186-206` | `resolveCases` relies on implicit key insertion order | ✅ Fixed (`50c0173`) | +| I-09 | INFO | — | No l10n strings needed | Verified | +| I-10 | **HIGH** | `ShellInputHandler.ts` / `DocumentDBShellPty.ts` | Full-line re-render assumes a single terminal row; wrapped input is not handled | ✅ Fixed (`6c2e7e4`) | +| I-11 | **HIGH** | `ShellInputHandler.ts` / `DocumentDBShellPty.ts` | Cursor math uses `String.length` instead of terminal display width | ✅ Fixed (`6c2e7e4`) | +| I-12 | MEDIUM | `ShellInputHandler.ts` / `DocumentDBShellPty.ts` | Completion and ghost-text insertions bypass the colorized re-render path | ✅ Fixed (`63c8f4f`) | +| I-13 | MEDIUM | shell highlighting tests | Missing coverage for wrapped lines, Unicode width, and completion redraw | Deferred (separate PR) | +| C-03 | INFO | `DocumentDBShellPty.ts` | `setPromptWidth` must be called 
for every prompt | Already handled correctly | + +--- + +## Recommendation + +**Address before merge:** +1. **I-01 / C-02** — ✅ Fixed in `eba9286` — Removed the spurious space in `regexpesc` +2. **I-03 / C-01** — ✅ Fixed in `af0a427` — Added `cachedRules` identity check to tokenizer cache +3. **I-10** — ✅ Fixed in `6c2e7e4` — `reRenderLine()` is now wrap-aware (moves cursor up to prompt row, uses `\x1b[J`, computes row/column offsets) +4. **I-11** — ✅ Fixed in `6c2e7e4` — Cursor math uses `terminalDisplayWidth()` (extracted to shared module) instead of `String.length` + +**Strongly consider before merge:** +5. **I-12** — ✅ Fixed in `63c8f4f` — `insertText()`, `replaceText()`, and `rewriteCurrentLine()` now route through `reRenderLine()` / `renderCurrentLine()` so highlighting applies on every buffer mutation +6. **I-13** — Deferred to a separate PR. Additional regression tests for wrapped lines, Unicode width, and completion redraw. + +--- + +## Fix Log + +All fixes applied 2026-04-15. 398 shell tests pass (12 suites). Prettier and lint clean. + +| Commit | Issue(s) | Summary | +|--------|----------|---------| +| `eba9286` | I-01, C-02 | Removed spurious space in `regexpesc` regex pattern. The second alternative `[...] |c[A-Z]` required a trailing space after escaped regex control characters, causing valid escapes like `\(` or `\[` to mis-tokenize. | +| `cad3ae1` | I-02 | Updated misleading comment that stated `@name` references are resolved by the executor. All references were inlined directly per the documented deviation. | +| `af0a427` | I-03, C-01 | Added `cachedRules` reference check to the tokenizer cache. The cache now requires both `input === cachedInput` and `rules === cachedRules` for a hit, honoring the API contract. | +| `6c35419` | I-04 | Renamed `input_indexOf` → `inputIndexOf` (camelCase per TypeScript convention). 
| +| `50c0173` | I-08 | Added comment documenting that key order in `actionCases` objects determines match priority (Object.entries insertion order). | +| `6c2e7e4` | I-10, I-11 | Made `reRenderLine()` wrap-aware: cursor moves up to prompt row before `\r`, uses `\x1b[J` instead of `\x1b[K`, computes row/column cursor repositioning. Extracted `terminalDisplayWidth()` from `ShellGhostText.ts` to shared `terminalDisplayWidth.ts` module. All cursor math now uses display width instead of `String.length`. Added `setColumns()` to `ShellInputHandler`, wired from `DocumentDBShellPty`. | +| `63c8f4f` | I-12 | `insertText()` and `replaceText()` now call `reRenderLine()` instead of manual ANSI echo. `rewriteCurrentLine()` delegates to `renderCurrentLine()` using the colorize callback. Syntax highlighting now applies on every buffer mutation including completions and ghost text. | diff --git a/src/documentdb/shell/DocumentDBShellPty.ts b/src/documentdb/shell/DocumentDBShellPty.ts index f9f21629a..1bec663cb 100644 --- a/src/documentdb/shell/DocumentDBShellPty.ts +++ b/src/documentdb/shell/DocumentDBShellPty.ts @@ -9,6 +9,7 @@ import { ext } from '../../extensionVariables'; import { deserializeResultForSchema, feedResultToSchemaStore } from '../feedResultToSchemaStore'; import { type SerializableExecutionResult } from '../playground/workerTypes'; import { SchemaStore } from '../SchemaStore'; +import { colorizeShellInput } from './highlighting/colorizeShellInput'; import { SettingsHintError } from './SettingsHintError'; import { type CompletionResult, ShellCompletionProvider } from './ShellCompletionProvider'; import { findCommonPrefix, renderCompletionList } from './ShellCompletionRenderer'; @@ -132,6 +133,12 @@ export class DocumentDBShellPty implements vscode.Pseudoterminal { onLine: (line: string) => void this.handleLineInput(line), onInterrupt: () => this.handleInterrupt(), onContinuation: () => this.showContinuationPrompt(), + colorize: (input: string) => { + if 
(!this.isColorEnabled()) { + return input; + } + return colorizeShellInput(input); + }, onTab: (buffer: string, cursor: number) => this.handleTab(buffer, cursor), onBufferChange: (buffer: string, cursor: number) => this.handleBufferChange(buffer, cursor), onAcceptGhostText: () => this.handleAcceptGhostText(), @@ -141,9 +148,10 @@ export class DocumentDBShellPty implements vscode.Pseudoterminal { // ─── Pseudoterminal interface ──────────────────────────────────────────── open(initialDimensions: vscode.TerminalDimensions | undefined): void { - // Track terminal width for completion rendering + // Track terminal width for completion rendering and wrap-aware re-rendering if (initialDimensions) { this._columns = initialDimensions.columns; + this._inputHandler.setColumns(initialDimensions.columns); } // Disable input during initialization to prevent race conditions @@ -222,6 +230,7 @@ export class DocumentDBShellPty implements vscode.Pseudoterminal { */ setDimensions(dimensions: vscode.TerminalDimensions): void { this._columns = dimensions.columns; + this._inputHandler.setColumns(dimensions.columns); } // ─── Private: Session initialization ───────────────────────────────────── @@ -701,18 +710,13 @@ export class DocumentDBShellPty implements vscode.Pseudoterminal { /** * Rewrite the prompt and current buffer after showing a completion list. + * Uses the colorize callback (via renderCurrentLine) so highlighting is preserved. 
*/ private rewriteCurrentLine(): void { const prompt = `${this._currentDatabase}> `; - const buffer = this._inputHandler.getBuffer(); - const cursor = this._inputHandler.getCursor(); - this._writeEmitter.fire(prompt + buffer); - - // Position cursor at the correct location - const trailingChars = buffer.length - cursor; - if (trailingChars > 0) { - this._writeEmitter.fire(`\x1b[${String(trailingChars)}D`); - } + this._inputHandler.setPromptWidth(prompt.length); + this._writeEmitter.fire(prompt); + this._inputHandler.renderCurrentLine(); } // ─── Private: Ghost text ───────────────────────────────────────────────── diff --git a/src/documentdb/shell/ShellGhostText.ts b/src/documentdb/shell/ShellGhostText.ts index e325d537b..60d28e4ca 100644 --- a/src/documentdb/shell/ShellGhostText.ts +++ b/src/documentdb/shell/ShellGhostText.ts @@ -15,6 +15,8 @@ * continues typing at the same location. */ +import { terminalDisplayWidth } from './terminalDisplayWidth'; + // ─── ANSI constants ────────────────────────────────────────────────────────── /** Dim + gray for ghost text appearance. */ @@ -23,62 +25,6 @@ const ANSI_RESET = '\x1b[0m'; /** Erase from cursor to end of line. */ const ERASE_TO_EOL = '\x1b[K'; -/** - * Compute the display-column width of a string for the terminal. - * - * JavaScript's `String.length` counts UTF-16 code units, but ANSI cursor - * movement operates on display columns. Surrogate pairs (emoji, symbols - * above U+FFFF) are 2 code units but typically 1–2 terminal columns. - * - * This uses `Intl.Segmenter` (available in Node 16+) to iterate grapheme - * clusters and counts each one as 1 column unless it is a known - * full-width/wide character (CJK Unified Ideographs, etc.). 
- */ -function terminalDisplayWidth(text: string): number { - // Fast path: ASCII-only strings (common case) - if (/^[\x20-\x7e]*$/.test(text)) { - return text.length; - } - - let width = 0; - // Use Intl.Segmenter to properly iterate grapheme clusters - // This handles surrogate pairs, combining marks, ZWJ sequences, etc. - const segmenter = new Intl.Segmenter(undefined, { granularity: 'grapheme' }); - for (const { segment } of segmenter.segment(text)) { - const cp = segment.codePointAt(0) ?? 0; - // Full-width / wide characters occupy 2 columns - if (isWideCharacter(cp)) { - width += 2; - } else { - width += 1; - } - } - - return width; -} - -/** - * Returns true for code points that occupy 2 terminal columns. - * Covers CJK Unified Ideographs and common full-width ranges. - */ -function isWideCharacter(cp: number): boolean { - return ( - (cp >= 0x1100 && cp <= 0x115f) || // Hangul Jamo - (cp >= 0x2e80 && cp <= 0x303e) || // CJK Radicals, Kangxi, CJK Symbols - (cp >= 0x3040 && cp <= 0x33bf) || // Hiragana, Katakana, Bopomofo, etc. - (cp >= 0x3400 && cp <= 0x4dbf) || // CJK Unified Ideographs Extension A - (cp >= 0x4e00 && cp <= 0xa4cf) || // CJK Unified Ideographs + Yi - (cp >= 0xa960 && cp <= 0xa97c) || // Hangul Jamo Extended-A - (cp >= 0xac00 && cp <= 0xd7a3) || // Hangul Syllables - (cp >= 0xf900 && cp <= 0xfaff) || // CJK Compatibility Ideographs - (cp >= 0xfe30 && cp <= 0xfe6f) || // CJK Compatibility Forms - (cp >= 0xff01 && cp <= 0xff60) || // Fullwidth Forms - (cp >= 0xffe0 && cp <= 0xffe6) || // Fullwidth Signs - (cp >= 0x20000 && cp <= 0x2fffd) || // CJK Unified Ideographs Extension B+ - (cp >= 0x30000 && cp <= 0x3fffd) // CJK Unified Ideographs Extension G+ - ); -} - /** * Manages the lifecycle of ghost text in the terminal. 
* diff --git a/src/documentdb/shell/ShellInputHandler.test.ts b/src/documentdb/shell/ShellInputHandler.test.ts index 47b1781ad..010c01cc6 100644 --- a/src/documentdb/shell/ShellInputHandler.test.ts +++ b/src/documentdb/shell/ShellInputHandler.test.ts @@ -40,8 +40,9 @@ describe('ShellInputHandler', () => { describe('basic character input', () => { it('should echo printable characters', () => { handler.handleInput('a'); - expect(written).toBe('a'); expect(handler.getBuffer()).toBe('a'); + // The re-render output contains ANSI positioning + the character + expect(written).toContain('a'); }); it('should accumulate characters in buffer', () => { @@ -56,7 +57,8 @@ describe('ShellInputHandler', () => { it('should handle multi-character input at once', () => { handler.handleInput('hello'); expect(handler.getBuffer()).toBe('hello'); - expect(written).toBe('hello'); + // The re-render output contains ANSI positioning + the full text + expect(written).toContain('hello'); }); it('should ignore control characters below space (except special ones)', () => { diff --git a/src/documentdb/shell/ShellInputHandler.ts b/src/documentdb/shell/ShellInputHandler.ts index d2c0d60f6..3f2b43f19 100644 --- a/src/documentdb/shell/ShellInputHandler.ts +++ b/src/documentdb/shell/ShellInputHandler.ts @@ -12,12 +12,10 @@ */ import { isExpressionIncomplete } from './bracketDepthCounter'; +import { terminalDisplayWidth } from './terminalDisplayWidth'; // ─── ANSI constants ────────────────────────────────────────────────────────── -/** Erase from cursor to end of line */ -const ERASE_TO_EOL = '\x1b[K'; - /** * Callbacks for the ShellInputHandler to communicate with the Pseudoterminal. */ @@ -30,6 +28,8 @@ export interface ShellInputHandlerCallbacks { onInterrupt: () => void; /** Called when multi-line continuation is needed (PTY shows a continuation prompt). */ onContinuation: () => void; + /** Optional: colorize the input buffer for syntax highlighting. 
*/ + colorize?: (input: string) => string; /** Called when the user presses Tab — the PTY handles completion logic. */ onTab?: (buffer: string, cursor: number) => void; /** Called after any buffer/cursor change — the PTY uses this for ghost text. */ @@ -71,6 +71,12 @@ export class ShellInputHandler { /** Whether we are in the middle of reading an escape sequence. */ private _inEscape: boolean = false; + /** Width of the prompt string in characters (for cursor repositioning). */ + private _promptWidth: number = 0; + + /** Terminal width in columns (for wrap-aware re-rendering). */ + private _columns: number = 80; + private readonly _callbacks: ShellInputHandlerCallbacks; constructor(callbacks: ShellInputHandlerCallbacks) { @@ -79,10 +85,16 @@ export class ShellInputHandler { /** * Set the prompt width so cursor positioning accounts for it. - * Reserved for future multi-line wrapping support. */ - setPromptWidth(_width: number): void { - // Reserved for future multi-line input support + setPromptWidth(width: number): void { + this._promptWidth = width; + } + + /** + * Set the terminal width in columns (for wrap-aware re-rendering). + */ + setColumns(columns: number): void { + this._columns = columns; } /** @@ -143,6 +155,14 @@ export class ShellInputHandler { return this._cursor; } + /** + * Force a re-render of the current line (used after PTY-controlled mutations + * like rewriting the prompt after a completion list is shown). + */ + renderCurrentLine(): void { + this.reRenderLine(); + } + /** * Insert text at the current cursor position and update the display. * Used by the PTY to insert accepted completions or ghost text. 
@@ -156,14 +176,7 @@ export class ShellInputHandler { const after = this._buffer.slice(this._cursor); this._buffer = before + text + after; this._cursor += text.length; - - if (after.length > 0) { - // Insert mode: write text + rest of line, move cursor back - this._callbacks.write(text + after + '\b'.repeat(after.length)); - } else { - // Append mode: just echo the text - this._callbacks.write(text); - } + this.reRenderLine(); } /** @@ -183,27 +196,8 @@ export class ShellInputHandler { const before = this._buffer.slice(0, this._cursor - deleteCount); const after = this._buffer.slice(this._cursor); this._buffer = before + text + after; - - // Move cursor back to start of replaced region - if (deleteCount > 0) { - this._callbacks.write(`\x1b[${String(deleteCount)}D`); - } - - // Write new text + remainder of line - this._callbacks.write(text + after); - - // Erase leftover characters if replacement is shorter than deleted text - const cleanup = Math.max(0, deleteCount - text.length); - if (cleanup > 0) { - this._callbacks.write(' '.repeat(cleanup) + '\b'.repeat(cleanup)); - } - - // Move cursor back to end of inserted text (before 'after' portion) - if (after.length > 0) { - this._callbacks.write('\b'.repeat(after.length)); - } - this._cursor = before.length + text.length; + this.reRenderLine(); } /** @@ -361,29 +355,16 @@ export class ShellInputHandler { return; } - const before = this._buffer.slice(0, this._cursor - 1); - const after = this._buffer.slice(this._cursor); - this._buffer = before + after; + this._buffer = this._buffer.slice(0, this._cursor - 1) + this._buffer.slice(this._cursor); this._cursor--; - - // Move cursor back one, rewrite remainder, erase trailing char - this._callbacks.write('\b' + after + ' ' + '\b'.repeat(after.length + 1)); + this.reRenderLine(); this._callbacks.onBufferChange?.(this._buffer, this._cursor); } private insertCharacter(ch: string): void { - const before = this._buffer.slice(0, this._cursor); - const after = 
this._buffer.slice(this._cursor); - this._buffer = before + ch + after; + this._buffer = this._buffer.slice(0, this._cursor) + ch + this._buffer.slice(this._cursor); this._cursor++; - - if (after.length > 0) { - // Insert mode: write char + rest of line, move cursor back - this._callbacks.write(ch + after + '\b'.repeat(after.length)); - } else { - // Append mode: just echo the character - this._callbacks.write(ch); - } + this.reRenderLine(); } // ─── Private: escape sequence handling ─────────────────────────────────── @@ -528,12 +509,8 @@ export class ShellInputHandler { return; } - const before = this._buffer.slice(0, this._cursor); - const after = this._buffer.slice(this._cursor + 1); - this._buffer = before + after; - - // Rewrite remainder + erase trailing char - this._callbacks.write(after + ' ' + '\b'.repeat(after.length + 1)); + this._buffer = this._buffer.slice(0, this._cursor) + this._buffer.slice(this._cursor + 1); + this.reRenderLine(); this._callbacks.onBufferChange?.(this._buffer, this._cursor); } @@ -542,21 +519,15 @@ export class ShellInputHandler { return; } - const after = this._buffer.slice(this._cursor); - const eraseCount = this._cursor; - this._buffer = after; + this._buffer = this._buffer.slice(this._cursor); this._cursor = 0; - - // Move cursor to start of input, rewrite remaining text, erase old chars - this._callbacks.write( - `\x1b[${String(eraseCount)}D` + after + ' '.repeat(eraseCount) + '\b'.repeat(after.length + eraseCount), - ); + this.reRenderLine(); this._callbacks.onBufferChange?.(this._buffer, this._cursor); } private clearAfterCursor(): void { this._buffer = this._buffer.slice(0, this._cursor); - this._callbacks.write(ERASE_TO_EOL); + this.reRenderLine(); this._callbacks.onBufferChange?.(this._buffer, this._cursor); } @@ -575,15 +546,9 @@ export class ShellInputHandler { pos--; } - const deleted = this._cursor - pos; - const after = this._buffer.slice(this._cursor); - this._buffer = this._buffer.slice(0, pos) + after; - - // 
Move left, rewrite remainder, erase trailing - this._callbacks.write( - `\x1b[${String(deleted)}D` + after + ' '.repeat(deleted) + '\b'.repeat(after.length + deleted), - ); + this._buffer = this._buffer.slice(0, pos) + this._buffer.slice(this._cursor); this._cursor = pos; + this.reRenderLine(); this._callbacks.onBufferChange?.(this._buffer, this._cursor); } @@ -631,16 +596,81 @@ export class ShellInputHandler { // Flatten multi-line history entries for single-line display const displayText = newText.replace(/\n/g, ' '); - // Move cursor to start of input - if (this._cursor > 0) { - this._callbacks.write(`\x1b[${String(this._cursor)}D`); + this._buffer = displayText; + this._cursor = displayText.length; + this.reRenderLine(); + } + + // ─── Private: line re-rendering ────────────────────────────────────────── + + /** + * Re-render the entire input line with syntax highlighting. + * + * This replaces the old per-character echo approach. On every buffer mutation: + * 1. Move cursor up to the prompt row if input wraps across multiple rows. + * 2. Move cursor to the start of the input area (after the prompt). + * 3. Write the (optionally colorized) buffer content. + * 4. Erase any leftover characters/rows from the previous (longer) buffer. + * 5. Reposition the cursor to the correct row and column. + */ + private reRenderLine(): void { + const bufferWidth = terminalDisplayWidth(this._buffer); + const cursorDisplayOffset = terminalDisplayWidth(this._buffer.slice(0, this._cursor)); + const cols = this._columns; + + let output = ''; + + // Step 1: Move cursor up to the prompt row if wrapping occurred + const cursorAbsCol = this._promptWidth + cursorDisplayOffset; + const cursorRow = cols > 0 ? 
Math.floor(cursorAbsCol / cols) : 0; + if (cursorRow > 0) { + output += `\x1b[${String(cursorRow)}A`; } - // Write new text and erase any leftover characters - const clearLen = Math.max(0, this._buffer.length - displayText.length); - this._callbacks.write(displayText + ' '.repeat(clearLen) + '\b'.repeat(clearLen)); + // Step 2: Carriage return + move right past the prompt + output += '\r'; + if (this._promptWidth > 0) { + output += `\x1b[${String(this._promptWidth)}C`; + } - this._buffer = displayText; - this._cursor = displayText.length; + // Step 3: Write the buffer content, optionally colorized + const displayText = this._callbacks.colorize ? this._callbacks.colorize(this._buffer) : this._buffer; + output += displayText; + + // Step 4: Erase from cursor to end of screen (handles wrapped leftover rows) + output += '\x1b[J'; + + // Step 5: Reposition cursor to the correct position + const endAbsCol = this._promptWidth + bufferWidth; + const targetAbsCol = this._promptWidth + cursorDisplayOffset; + + if (cols > 0 && endAbsCol !== targetAbsCol) { + const endRow = Math.floor(endAbsCol / cols); + const targetRow = Math.floor(targetAbsCol / cols); + const endCol = endAbsCol % cols; + const targetCol = targetAbsCol % cols; + + // Move up from end row to target row + const rowDiff = endRow - targetRow; + if (rowDiff > 0) { + output += `\x1b[${String(rowDiff)}A`; + } + + // Move horizontally to target column + const colDiff = endCol - targetCol; + if (colDiff > 0) { + output += `\x1b[${String(colDiff)}D`; + } else if (colDiff < 0) { + output += `\x1b[${String(-colDiff)}C`; + } + } else { + // Fallback for unknown columns: simple cursor-back + const tailWidth = terminalDisplayWidth(this._buffer.slice(this._cursor)); + if (tailWidth > 0) { + output += `\x1b[${String(tailWidth)}D`; + } + } + + this._callbacks.write(output); } } diff --git a/src/documentdb/shell/highlighting/colorizeShellInput.ts b/src/documentdb/shell/highlighting/colorizeShellInput.ts new file mode 100644 
index 000000000..d2c07efa4 --- /dev/null +++ b/src/documentdb/shell/highlighting/colorizeShellInput.ts @@ -0,0 +1,24 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/** + * Convenience function that combines the tokenizer and colorizer + * into a single call for use by the shell PTY. + */ + +import { shellLanguageRules } from './monarchRules'; +import { tokenize } from './monarchRunner'; +import { colorizeInput } from './tokenColorizer'; + +/** + * Tokenize and colorize a shell input string in one step. + * + * @param input - The raw input string from the shell line buffer. + * @returns The input with ANSI color codes for syntax highlighting. + */ +export function colorizeShellInput(input: string): string { + const tokens = tokenize(input, shellLanguageRules); + return colorizeInput(input, tokens); +} diff --git a/src/documentdb/shell/highlighting/monarchRules.ts b/src/documentdb/shell/highlighting/monarchRules.ts new file mode 100644 index 000000000..114ab0a5a --- /dev/null +++ b/src/documentdb/shell/highlighting/monarchRules.ts @@ -0,0 +1,361 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/* eslint-disable no-useless-escape -- Regex patterns vendored from Monaco Editor; preserved as-is */ + +/** + * Vendored and extended Monarch tokenizer rules for the DocumentDB interactive shell. 
+ * + * Based on the JavaScript variant of the Monaco Editor's Monarch tokenizer rules + * (monaco-editor/esm/vs/basic-languages/typescript/typescript.ts and + * monaco-editor/esm/vs/basic-languages/javascript/javascript.ts). + * + * The Monaco Editor is MIT-licensed: + * Copyright (c) Microsoft Corporation. All rights reserved. + * https://github.com/microsoft/monaco-editor/blob/main/LICENSE.txt + * + * Extended with DocumentDB-specific token categories: + * - BSON constructors (ObjectId, ISODate, etc.) + * - Shell commands (show, use, it, exit, etc.) + * - $-prefixed query/aggregation operators ($gt, $match, etc.) + */ + +// ─── Rule types ────────────────────────────────────────────────────────────── + +/** + * A single Monarch tokenizer rule. Each variant corresponds to a different + * action the state machine can take when a regex matches. + */ +export type MonarchRule = + | { regex: RegExp; action: string } // match → emit token + | { regex: RegExp; action: string; next: string } // match → emit token + push/pop state + | { regex: RegExp; actionCases: Record<string, string>; next?: string } // match → lookup cases + optional next + | { regex: RegExp; actionByGroup: string[]; next?: string } // match → one token per group + optional next + | { include: string }; // include another state's rules + +/** + * The complete set of tokenizer rules for the shell language.
+ */ +export interface MonarchLanguageRules { + readonly keywords: readonly string[]; + readonly bsonConstructors: readonly string[]; + readonly shellCommands: readonly string[]; + readonly operators: readonly string[]; + readonly symbols: RegExp; + readonly escapes: RegExp; + readonly digits: RegExp; + readonly octaldigits: RegExp; + readonly binarydigits: RegExp; + readonly hexdigits: RegExp; + readonly regexpctl: RegExp; + readonly regexpesc: RegExp; + readonly tokenizer: Record<string, MonarchRule[]>; +} + +// ─── JavaScript keywords (from Monaco's JS language definition) ────────────── + +const keywords: readonly string[] = [ + 'break', + 'case', + 'catch', + 'class', + 'continue', + 'const', + 'constructor', + 'debugger', + 'default', + 'delete', + 'do', + 'else', + 'export', + 'extends', + 'false', + 'finally', + 'for', + 'from', + 'function', + 'get', + 'if', + 'import', + 'in', + 'instanceof', + 'let', + 'new', + 'null', + 'return', + 'set', + 'static', + 'super', + 'switch', + 'symbol', + 'this', + 'throw', + 'true', + 'try', + 'typeof', + 'undefined', + 'var', + 'void', + 'while', + 'with', + 'yield', + 'async', + 'await', + 'of', +]; + +// ─── DocumentDB extensions ─────────────────────────────────────────────────── + +const bsonConstructors: readonly string[] = [ + 'ObjectId', + 'ISODate', + 'NumberLong', + 'NumberInt', + 'NumberDecimal', + 'BinData', + 'UUID', + 'Timestamp', + 'MinKey', + 'MaxKey', +]; + +const shellCommands: readonly string[] = ['show', 'use', 'it', 'exit', 'quit', 'cls', 'clear', 'help']; + +// ─── Operators (from Monaco's TypeScript language definition) ───────────────── + +const operators: readonly string[] = [ + '<=', + '>=', + '==', + '!=', + '===', + '!==', + '=>', + '+', + '-', + '**', + '*', + '/', + '%', + '++', + '--', + '<<', + '</', + '>>', + '>>>', + '&', + '|', + '^', + '!', + '~', + '&&', + '||', + '??', + '?', + ':', + '=', + '+=', + '-=', + '*=', + '**=', + '/=', + '%=', + '<<=', + '>>=', + '>>>=', + '&=', + '|=', + '^=', + '@', +]; + +// ─── Named
regex patterns ──────────────────────────────────────────────────── + +const symbols = /[=> = { + root: [{ regex: /[{}]/, action: 'delimiter.bracket' }, { include: 'common' }], + + common: [ + // $-prefixed DocumentDB API operators — must come before general identifiers + { regex: /\$[a-zA-Z_]\w*/, action: 'documentdb.operator' }, + + // Lowercase identifiers and keywords + // NOTE: Key order matters — resolveCases checks keys in insertion order + // and returns the first match. shellCommands must be checked before keywords. + { + regex: /#?[a-z_$][\w$]*/, + actionCases: { + '@shellCommands': 'shell.command', + '@keywords': 'keyword', + '@default': 'identifier', + }, + }, + + // PascalCase identifiers — check for BSON constructors first + { + regex: /[A-Z][\w$]*/, + actionCases: { + '@bsonConstructors': 'bson.constructor', + '@default': 'type.identifier', + }, + }, + + // Whitespace and comments + { include: 'whitespace' }, + + // Regular expression literal — ensure it is terminated before beginning + // (otherwise it is an operator) + { + regex: /\/(?=([^\\\/]|\\.)+\/([dgimsuy]*)(\s*)(\.|;|,|\)|\]|\}|$))/, + action: 'regexp', + next: 'regexp', + }, + + // Delimiters and operators + { regex: /[()\[\]]/, action: '@brackets' }, + { regex: /[<>](?![=> { + return tokenize(input, shellLanguageRules).map((t) => [input.slice(t.start, t.end), t.type]); +} + +describe('MonarchRunner', () => { + describe('keywords', () => { + it('should tokenize "const" as a keyword', () => { + const pairs = tokenPairs('const x = 1'); + expect(pairs[0]).toEqual(['const', 'keyword']); + }); + + it('should tokenize "let" as a keyword', () => { + const pairs = tokenPairs('let y'); + expect(pairs[0]).toEqual(['let', 'keyword']); + }); + + it('should tokenize "function" as a keyword', () => { + const pairs = tokenPairs('function foo() {}'); + expect(pairs[0]).toEqual(['function', 'keyword']); + }); + + it('should tokenize "return" as a keyword', () => { + const pairs = tokenPairs('return true'); 
+ expect(pairs[0]).toEqual(['return', 'keyword']); + expect(pairs[1]).toEqual(['true', 'keyword']); + }); + }); + + describe('strings', () => { + it('should tokenize a double-quoted string', () => { + const pairs = tokenPairs('"hello world"'); + // The quotes and content are tokenized as string tokens + const stringTokens = pairs.filter(([, type]) => type === 'string'); + expect(stringTokens.length).toBeGreaterThan(0); + // The full span should cover the entire input + const tokens = tokenize('"hello world"', shellLanguageRules); + const fullText = tokens.map((t) => '"hello world"'.slice(t.start, t.end)).join(''); + expect(fullText).toBe('"hello world"'); + }); + + it('should tokenize a single-quoted string', () => { + const tokens = tokenize("'hello'", shellLanguageRules); + const types = tokens.map((t) => t.type); + expect(types.every((t) => t === 'string' || t === 'string.escape')).toBe(true); + }); + + it('should tokenize an unterminated double-quoted string as string.invalid', () => { + const pairs = tokenPairs('"hello'); + expect(pairs.some(([, type]) => type === 'string.invalid')).toBe(true); + }); + + it('should tokenize an unterminated single-quoted string as string.invalid', () => { + const pairs = tokenPairs("'hello"); + expect(pairs.some(([, type]) => type === 'string.invalid')).toBe(true); + }); + }); + + describe('template literals', () => { + it('should tokenize a template literal', () => { + const input = '`hello ${name}`'; + const tokens = tokenize(input, shellLanguageRules); + const types = tokens.map((t) => t.type); + // Should contain string parts and delimiter.bracket for ${ and } + expect(types).toContain('string'); + expect(types).toContain('delimiter.bracket'); + expect(types).toContain('identifier'); + }); + }); + + describe('numbers', () => { + it('should tokenize an integer', () => { + const pairs = tokenPairs('42'); + expect(pairs[0]).toEqual(['42', 'number']); + }); + + it('should tokenize a float', () => { + const pairs = 
tokenPairs('3.14'); + expect(pairs[0]).toEqual(['3.14', 'number.float']); + }); + + it('should tokenize a hex number', () => { + const pairs = tokenPairs('0xFF'); + expect(pairs[0]).toEqual(['0xFF', 'number.hex']); + }); + + it('should tokenize an octal number', () => { + const pairs = tokenPairs('0o77'); + expect(pairs[0]).toEqual(['0o77', 'number.octal']); + }); + + it('should tokenize a binary number', () => { + const pairs = tokenPairs('0b1010'); + expect(pairs[0]).toEqual(['0b1010', 'number.binary']); + }); + + it('should tokenize a number with exponent', () => { + const pairs = tokenPairs('1e10'); + expect(pairs[0]).toEqual(['1e10', 'number.float']); + }); + }); + + describe('comments', () => { + it('should tokenize a line comment', () => { + const pairs = tokenPairs('// a comment'); + expect(pairs[0]).toEqual(['// a comment', 'comment']); + }); + + it('should tokenize a block comment', () => { + const pairs = tokenPairs('/* block */'); + // Block comment spans multiple tokens that get merged + const types = [...new Set(pairs.map(([, type]) => type))]; + expect(types).toEqual(['comment']); + }); + + it('should tokenize a JSDoc comment', () => { + const pairs = tokenPairs('/** doc */'); + const types = [...new Set(pairs.map(([, type]) => type))]; + expect(types).toEqual(['comment.doc']); + }); + }); + + describe('regex literals', () => { + it('should tokenize a regex literal', () => { + const input = '/^hello/i'; + const tokens = tokenize(input, shellLanguageRules); + const types = tokens.map((t) => t.type); + expect(types).toContain('regexp'); + }); + }); + + describe('BSON constructors', () => { + it('should tokenize ObjectId as bson.constructor', () => { + const pairs = tokenPairs('ObjectId("abc")'); + expect(pairs[0]).toEqual(['ObjectId', 'bson.constructor']); + }); + + it('should tokenize ISODate as bson.constructor', () => { + const pairs = tokenPairs('ISODate("2025-01-01")'); + expect(pairs[0]).toEqual(['ISODate', 'bson.constructor']); + }); + + 
it('should tokenize NumberLong as bson.constructor', () => { + const pairs = tokenPairs('NumberLong(42)'); + expect(pairs[0]).toEqual(['NumberLong', 'bson.constructor']); + }); + + it('should tokenize MinKey as bson.constructor', () => { + const pairs = tokenPairs('MinKey()'); + expect(pairs[0]).toEqual(['MinKey', 'bson.constructor']); + }); + + it('should tokenize MaxKey as bson.constructor', () => { + const pairs = tokenPairs('MaxKey()'); + expect(pairs[0]).toEqual(['MaxKey', 'bson.constructor']); + }); + + it('should not tokenize unknown PascalCase as bson.constructor', () => { + const pairs = tokenPairs('MyClass'); + expect(pairs[0]).toEqual(['MyClass', 'type.identifier']); + }); + }); + + describe('DocumentDB API operators', () => { + it('should tokenize $gt as documentdb.operator', () => { + const input = '{ $gt: 5 }'; + const tokens = tokenize(input, shellLanguageRules); + const op = tokens.find((t) => input.slice(t.start, t.end) === '$gt'); + expect(op).toBeDefined(); + expect(op!.type).toBe('documentdb.operator'); + }); + + it('should tokenize $match as documentdb.operator', () => { + const pairs = tokenPairs('$match'); + expect(pairs[0]).toEqual(['$match', 'documentdb.operator']); + }); + + it('should tokenize $lookup as documentdb.operator', () => { + const pairs = tokenPairs('$lookup'); + expect(pairs[0]).toEqual(['$lookup', 'documentdb.operator']); + }); + }); + + describe('shell commands', () => { + it('should tokenize "show" as shell.command', () => { + const pairs = tokenPairs('show dbs'); + expect(pairs[0]).toEqual(['show', 'shell.command']); + }); + + it('should tokenize "use" as shell.command', () => { + const pairs = tokenPairs('use mydb'); + expect(pairs[0]).toEqual(['use', 'shell.command']); + }); + + it('should tokenize "exit" as shell.command', () => { + const pairs = tokenPairs('exit'); + expect(pairs[0]).toEqual(['exit', 'shell.command']); + }); + + it('should tokenize "help" as shell.command', () => { + const pairs = tokenPairs('help'); + 
expect(pairs[0]).toEqual(['help', 'shell.command']); + }); + + it('should tokenize "it" as shell.command', () => { + const pairs = tokenPairs('it'); + expect(pairs[0]).toEqual(['it', 'shell.command']); + }); + }); + + describe('mixed expressions', () => { + it('should tokenize db.users.find({ name: "alice" })', () => { + const input = 'db.users.find({ name: "alice" })'; + const tokens = tokenize(input, shellLanguageRules); + + // "db" is an identifier + const db = tokens.find((t) => input.slice(t.start, t.end) === 'db'); + expect(db?.type).toBe('identifier'); + + // "users" is an identifier + const users = tokens.find((t) => input.slice(t.start, t.end) === 'users'); + expect(users?.type).toBe('identifier'); + + // "find" is an identifier + const find = tokens.find((t) => input.slice(t.start, t.end) === 'find'); + expect(find?.type).toBe('identifier'); + + // "name" is an identifier + const name = tokens.find((t) => input.slice(t.start, t.end) === 'name'); + expect(name?.type).toBe('identifier'); + + // The string content is tokenized as string + expect(tokens.some((t) => t.type === 'string')).toBe(true); + + // Brackets are proper delimiters + expect(tokens.some((t) => t.type === 'delimiter.bracket')).toBe(true); + }); + + it('should tokenize { $gt: 5 } with correct types', () => { + const input = '{ $gt: 5 }'; + const tokens = tokenize(input, shellLanguageRules); + const pairs = tokens.map((t) => [input.slice(t.start, t.end), t.type]); + + expect(pairs).toEqual([ + ['{', 'delimiter.bracket'], + ['$gt', 'documentdb.operator'], + [':', 'delimiter'], + ['5', 'number'], + ['}', 'delimiter.bracket'], + ]); + }); + }); + + describe('empty input', () => { + it('should return empty array for empty string', () => { + expect(tokenize('', shellLanguageRules)).toEqual([]); + }); + }); + + describe('caching', () => { + it('should return the same result for the same input', () => { + const result1 = tokenize('const x = 1', shellLanguageRules); + const result2 = tokenize('const 
x = 1', shellLanguageRules); + expect(result1).toBe(result2); // Same reference (cached) + }); + + it('should return different result for different input', () => { + const result1 = tokenize('const x = 1', shellLanguageRules); + const result2 = tokenize('let y = 2', shellLanguageRules); + expect(result1).not.toBe(result2); + }); + }); + + describe('identifiers', () => { + it('should tokenize regular identifiers', () => { + const pairs = tokenPairs('foo'); + expect(pairs[0]).toEqual(['foo', 'identifier']); + }); + + it('should tokenize private identifiers with #', () => { + const pairs = tokenPairs('#private'); + expect(pairs[0]).toEqual(['#private', 'identifier']); + }); + }); + + describe('delimiters', () => { + it('should tokenize semicolons as delimiter', () => { + const tokens = tokenize('a;b', shellLanguageRules); + const semi = tokens.find((t) => 'a;b'.slice(t.start, t.end) === ';'); + expect(semi?.type).toBe('delimiter'); + }); + + it('should tokenize dots as delimiter', () => { + const tokens = tokenize('a.b', shellLanguageRules); + const dot = tokens.find((t) => 'a.b'.slice(t.start, t.end) === '.'); + expect(dot?.type).toBe('delimiter'); + }); + }); + + describe('operators', () => { + it('should tokenize = as delimiter (operator)', () => { + const input = 'x = 1'; + const tokens = tokenize(input, shellLanguageRules); + const eq = tokens.find((t) => input.slice(t.start, t.end) === '='); + expect(eq?.type).toBe('delimiter'); + }); + + it('should tokenize === as delimiter (operator)', () => { + const input = 'x === 1'; + const tokens = tokenize(input, shellLanguageRules); + const eq = tokens.find((t) => input.slice(t.start, t.end) === '==='); + expect(eq?.type).toBe('delimiter'); + }); + }); + + describe('escape sequences in strings', () => { + it('should tokenize escape sequences in double-quoted strings', () => { + const input = '"hello\\nworld"'; + const tokens = tokenize(input, shellLanguageRules); + expect(tokens.some((t) => t.type === 
'string.escape')).toBe(true); + }); + }); +}); diff --git a/src/documentdb/shell/highlighting/monarchRunner.ts b/src/documentdb/shell/highlighting/monarchRunner.ts new file mode 100644 index 000000000..0e183da7c --- /dev/null +++ b/src/documentdb/shell/highlighting/monarchRunner.ts @@ -0,0 +1,305 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/** + * Lightweight Monarch state-machine executor. + * + * Runs the tokenizer rules from {@link MonarchLanguageRules} against a plain string + * and returns an array of token spans. No Monaco or DOM dependency. + */ + +import { type MonarchLanguageRules, type MonarchRule } from './monarchRules'; + +// ─── Public types ──────────────────────────────────────────────────────────── + +/** A span of text with its token type. */ +export interface TokenSpan { + /** Start offset (inclusive). */ + start: number; + /** End offset (exclusive). */ + end: number; + /** The Monarch token type, e.g. "keyword", "string", "bson.constructor". */ + type: string; +} + +// ─── Cached previous result ────────────────────────────────────────────────── + +let cachedInput: string | undefined; +let cachedResult: TokenSpan[] | undefined; +let cachedRules: MonarchLanguageRules | undefined; + +// ─── Public API ────────────────────────────────────────────────────────────── + +/** + * Tokenize an input string using the Monarch state machine. + * + * @param input - The string to tokenize (typically one line of shell input). + * @param rules - The Monarch language rules to apply. + * @returns An array of token spans covering the entire input. 
+ */ +export function tokenize(input: string, rules: MonarchLanguageRules): TokenSpan[] { + if (input.length === 0) { + return []; + } + + // Memoize: return cached result if input and rules haven't changed (cursor-only movements) + if (input === cachedInput && rules === cachedRules && cachedResult !== undefined) { + return cachedResult; + } + + const result = runTokenizer(input, rules); + cachedInput = input; + cachedRules = rules; + cachedResult = result; + return result; +} + +// ─── State machine ─────────────────────────────────────────────────────────── + +const MAX_STACK_DEPTH = 32; + +function runTokenizer(input: string, rules: MonarchLanguageRules): TokenSpan[] { + const tokens: TokenSpan[] = []; + const stateStack: string[] = ['root']; + let pos = 0; + + while (pos < input.length) { + const currentState = stateStack[stateStack.length - 1]; + const stateRules = rules.tokenizer[currentState]; + + if (!stateRules) { + // Unknown state — consume one character as invalid + tokens.push({ start: pos, end: pos + 1, type: 'invalid' }); + pos++; + continue; + } + + const matched = tryMatchRules(input, pos, stateRules, stateStack, rules, tokens); + + if (!matched) { + // No rule matched — consume one character as invalid to prevent infinite loops + tokens.push({ start: pos, end: pos + 1, type: 'invalid' }); + pos++; + } else { + pos = matched; + } + } + + return mergeAdjacentTokens(tokens); +} + +/** + * Try each rule in the given state's rule list. If a rule matches, emit tokens + * and return the new position. If no rule matches, return 0. + */ +function tryMatchRules( + input: string, + pos: number, + stateRules: MonarchRule[], + stateStack: string[], + rules: MonarchLanguageRules, + tokens: TokenSpan[], +): number { + for (const rule of stateRules) { + // Handle include directives + if ('include' in rule) { + const includedStateName = rule.include.startsWith('@') ? 
rule.include.slice(1) : rule.include; + const includedRules = rules.tokenizer[includedStateName]; + if (includedRules) { + const result = tryMatchRules(input, pos, includedRules, stateStack, rules, tokens); + if (result > 0) { + return result; + } + } + continue; + } + + // Try to match the rule's regex at the current position + const regex = anchorRegex(rule.regex); + regex.lastIndex = pos; + const match = regex.exec(input); + + if (!match || match.index !== pos) { + continue; + } + + const matchedText = match[0]; + + // Zero-length match — skip to prevent infinite loops + if (matchedText.length === 0) { + continue; + } + + // Determine token type(s) and state transition + if ('actionByGroup' in rule) { + // Group-based action: one token type per capture group + emitGroupTokens(match, pos, rule.actionByGroup, tokens); + if (rule.next) { + applyStateTransition(rule.next, stateStack); + } + } else if ('actionCases' in rule) { + // Case-based action: look up matched text in named arrays + const tokenType = resolveCases(matchedText, rule.actionCases, rules); + if (tokenType.length > 0) { + tokens.push({ start: pos, end: pos + matchedText.length, type: tokenType }); + } + if (rule.next) { + applyStateTransition(rule.next, stateStack); + } + } else { + // Simple action: emit a single token + const tokenType = resolveAction(rule.action); + if (tokenType.length > 0) { + tokens.push({ start: pos, end: pos + matchedText.length, type: tokenType }); + } + if ('next' in rule && rule.next) { + applyStateTransition(rule.next, stateStack); + } + } + + return pos + matchedText.length; + } + + return 0; +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +/** + * Resolve the `@` prefix in action strings. + * `@brackets` → `delimiter.bracket`, other `@` prefixes → strip the `@`. 
+ */ +function resolveAction(action: string): string { + if (action === '@brackets') { + return 'delimiter.bracket'; + } + if (action.startsWith('@')) { + return action.slice(1); + } + return action; +} + +/** + * Resolve a `cases` lookup: check if the matched text is in a named array. + */ +function resolveCases(matchedText: string, cases: Record<string, string>, rules: MonarchLanguageRules): string { + for (const [key, tokenType] of Object.entries(cases)) { + if (key === '@default') { + continue; + } + + // Look up the named array in the rules object + const arrayName = key.startsWith('@') ? key.slice(1) : key; + const array = rules[arrayName as keyof MonarchLanguageRules]; + + if (Array.isArray(array) && (array as string[]).includes(matchedText)) { + return resolveAction(tokenType); + } + } + + // Fall through to @default + const defaultType = cases['@default']; + if (defaultType !== undefined) { + return resolveAction(defaultType); + } + + return ''; +} + +/** + * Emit one token per capture group from a grouped action rule. + * Groups are matched positionally: group 1 → actionByGroup[0], etc. + * If a capture group is empty, no token is emitted for it. + */ +function emitGroupTokens(match: RegExpExecArray, basePos: number, actions: string[], tokens: TokenSpan[]): void { + let offset = basePos; + for (let i = 0; i < actions.length; i++) { + const groupText = match[i + 1]; // capture groups are 1-indexed + if (groupText === undefined || groupText.length === 0) { + continue; + } + + // Find the actual position of this group's text within the match + const groupStart = inputIndexOf(match[0], groupText, offset - basePos) + basePos; + const tokenType = resolveAction(actions[i]); + if (tokenType.length > 0) { + tokens.push({ start: groupStart, end: groupStart + groupText.length, type: tokenType }); + } + offset = groupStart + groupText.length; + } +} + +/** + * Find the index of `needle` in `haystack` starting from `fromIndex`.
/**
 * Locate `needle` inside `haystack`, searching from `fromIndex`.
 *
 * Unlike a bare `String.indexOf`, a miss does not yield `-1`: the search start
 * itself is returned, so callers always receive a usable offset.
 *
 * @param haystack - Text to search in.
 * @param needle - Text to search for.
 * @param fromIndex - Offset at which the search begins.
 * @returns Index of the first occurrence at or after `fromIndex`, or `fromIndex` on a miss.
 */
function inputIndexOf(haystack: string, needle: string, fromIndex: number): number {
    const found = haystack.indexOf(needle, fromIndex);
    if (found === -1) {
        return fromIndex;
    }
    return found;
}

/**
 * Apply a rule's `next` transition to the tokenizer state stack.
 *
 * A leading `@` on `next` is ignored. `pop` removes the top state but never
 * the last remaining one; any other name is pushed unless the stack already
 * holds `MAX_STACK_DEPTH` entries.
 *
 * @param next - Transition target from the rule (`@pop`, `@stateName`, ...).
 * @param stateStack - State stack, mutated in place.
 */
function applyStateTransition(next: string, stateStack: string[]): void {
    const target = next.startsWith('@') ? next.slice(1) : next;

    if (target !== 'pop') {
        if (stateStack.length < MAX_STACK_DEPTH) {
            stateStack.push(target);
        }
        return;
    }

    if (stateStack.length > 1) {
        stateStack.pop();
    }
}

// ─── Regex anchoring cache ───────────────────────────────────────────────────

/** Sticky clones keyed by the original regex object, so each rule regex is cloned once. */
const anchoredRegexCache = new WeakMap<RegExp, RegExp>();

/**
 * Get a sticky (`y`-flag) clone of `regex` that can only match exactly at
 * its `lastIndex`.
 *
 * The clone is created on first use and cached per source regex object. A
 * regex that is already sticky is still cloned, so the caller's object never
 * has its `lastIndex` touched.
 *
 * @param regex - Rule regex to anchor.
 * @returns The cached sticky clone.
 */
function anchorRegex(regex: RegExp): RegExp {
    const cached = anchoredRegexCache.get(regex);
    if (cached !== undefined) {
        return cached;
    }

    const stickyFlags = regex.sticky ? regex.flags : `${regex.flags}y`;
    const stickyClone = new RegExp(regex.source, stickyFlags);
    anchoredRegexCache.set(regex, stickyClone);
    return stickyClone;
}
/**
 * Merge runs of directly adjacent tokens that share a type into one span.
 *
 * Two tokens are merged when they have the same `type` and the later one
 * starts exactly where the earlier one ends. For example, consecutive
 * `comment` tokens become a single span.
 *
 * The input array and its span objects are never modified: every span in the
 * result is a fresh copy, so extending a merged span cannot leak back into
 * the caller's tokens.
 *
 * @param tokens - Token spans in input order.
 * @returns A new array of (possibly merged) spans; empty when `tokens` is empty.
 */
function mergeAdjacentTokens(tokens: TokenSpan[]): TokenSpan[] {
    const merged: TokenSpan[] = [];

    for (const curr of tokens) {
        const prev = merged[merged.length - 1];

        if (prev !== undefined && curr.type === prev.type && curr.start === prev.end) {
            // `prev` is a copy owned by `merged`, so extending it is safe.
            prev.end = curr.end;
        } else {
            merged.push({ ...curr });
        }
    }

    return merged;
}
/** Build a {@link TokenSpan} covering `[start, end)` with the given token type. */
function span(start: number, end: number, type: string): TokenSpan {
    return { start, end, type };
}

describe('tokenColorizer', () => {
    /**
     * Cases where a single token covers the whole input. `color` is the ANSI
     * sequence expected around the text; omit it for token types that must be
     * emitted without any color.
     */
    interface SingleTokenCase {
        title: string;
        text: string;
        type: string;
        color?: string;
    }

    const singleTokenSuites: ReadonlyArray<{ suite: string; cases: SingleTokenCase[] }> = [
        {
            suite: 'keywords',
            cases: [{ title: 'should colorize keywords in cyan', text: 'const', type: 'keyword', color: CYAN }],
        },
        {
            suite: 'strings',
            cases: [{ title: 'should colorize strings in green', text: '"hello"', type: 'string', color: GREEN }],
        },
        {
            suite: 'string escapes',
            cases: [
                {
                    title: 'should colorize escape sequences in yellow',
                    text: '\\n',
                    type: 'string.escape',
                    color: YELLOW,
                },
            ],
        },
        {
            suite: 'invalid strings',
            cases: [
                {
                    title: 'should colorize unterminated strings in red',
                    text: '"hello',
                    type: 'string.invalid',
                    color: RED,
                },
            ],
        },
        {
            suite: 'numbers',
            cases: [
                { title: 'should colorize integers in yellow', text: '42', type: 'number', color: YELLOW },
                { title: 'should colorize floats in yellow', text: '3.14', type: 'number.float', color: YELLOW },
                { title: 'should colorize hex numbers in yellow', text: '0xFF', type: 'number.hex', color: YELLOW },
            ],
        },
        {
            suite: 'comments',
            cases: [
                { title: 'should colorize line comments in gray', text: '// comment', type: 'comment', color: GRAY },
                { title: 'should colorize doc comments in gray', text: '/** doc */', type: 'comment.doc', color: GRAY },
            ],
        },
        {
            suite: 'regex',
            cases: [{ title: 'should colorize regex in red', text: '/hello/', type: 'regexp', color: RED }],
        },
        {
            suite: 'BSON constructors',
            cases: [
                {
                    title: 'should colorize BSON constructors in cyan',
                    text: 'ObjectId',
                    type: 'bson.constructor',
                    color: CYAN,
                },
            ],
        },
        {
            suite: 'DocumentDB API operators',
            cases: [
                {
                    title: 'should colorize DocumentDB operators in yellow',
                    text: '$gt',
                    type: 'documentdb.operator',
                    color: YELLOW,
                },
            ],
        },
        {
            suite: 'shell commands',
            cases: [
                {
                    title: 'should colorize shell commands in magenta',
                    text: 'show',
                    type: 'shell.command',
                    color: MAGENTA,
                },
            ],
        },
        {
            suite: 'uncolored tokens',
            cases: [
                { title: 'should not colorize identifiers', text: 'foo', type: 'identifier' },
                { title: 'should not colorize type.identifier', text: 'MyClass', type: 'type.identifier' },
                { title: 'should not colorize delimiters', text: ';', type: 'delimiter' },
                { title: 'should not colorize delimiter.bracket', text: '{', type: 'delimiter.bracket' },
            ],
        },
    ];

    for (const { suite, cases } of singleTokenSuites) {
        describe(suite, () => {
            for (const { title, text, type, color } of cases) {
                it(title, () => {
                    const expected = color === undefined ? text : `${color}${text}${RESET}`;
                    expect(colorizeInput(text, [span(0, text.length, type)])).toBe(expected);
                });
            }
        });
    }

    describe('empty input', () => {
        it('should return empty string for empty input', () => {
            expect(colorizeInput('', [])).toBe('');
        });
    });

    describe('full line integration', () => {
        it('should correctly colorize a mixed line', () => {
            const input = 'db.users.find({ $gt: 1 })';

            // Hand-written token stream for the line above; the spaces at
            // offsets 15, 20 and 22 are deliberately left as gaps.
            const layout: Array<[number, number, string]> = [
                [0, 2, 'identifier'], // db
                [2, 3, 'delimiter'], // .
                [3, 8, 'identifier'], // users
                [8, 9, 'delimiter'], // .
                [9, 13, 'identifier'], // find
                [13, 14, 'delimiter.bracket'], // (
                [14, 15, 'delimiter.bracket'], // {
                [16, 19, 'documentdb.operator'], // $gt
                [19, 20, 'delimiter'], // :
                [21, 22, 'number'], // 1
                [23, 24, 'delimiter.bracket'], // }
                [24, 25, 'delimiter.bracket'], // )
            ];
            const tokens: TokenSpan[] = layout.map(([start, end, type]) => span(start, end, type));

            const result = colorizeInput(input, tokens);

            // The colored fragments must be present ...
            for (const colored of [`${YELLOW}$gt${RESET}`, `${YELLOW}1${RESET}`]) {
                expect(result).toContain(colored);
            }
            // ... and the identifiers must still appear in the output.
            for (const plain of ['db', 'users', 'find']) {
                expect(result).toContain(plain);
            }
        });
    });

    describe('gaps between tokens', () => {
        it('should emit uncolored text for gaps between tokens', () => {
            const tokens: TokenSpan[] = [span(0, 1, 'identifier'), span(2, 3, 'identifier')];
            expect(colorizeInput('a b', tokens)).toBe('a b');
        });
    });

    describe('trailing text after last token', () => {
        it('should emit trailing text after the last token', () => {
            const tokens: TokenSpan[] = [span(0, 2, 'keyword')];
            expect(colorizeInput('abc', tokens)).toBe(`${CYAN}ab${RESET}c`);
        });
    });
});
// ─── ANSI escape codes ──────────────────────────────────────────────────────

/** SGR sequence that resets all attributes; appended after every colored span. */
const RESET = '\x1b[0m';

// ANSI 16-color foreground sequences used by the palette below.
const ANSI_RED = '\x1b[31m';
const ANSI_GREEN = '\x1b[32m';
const ANSI_YELLOW = '\x1b[33m';
const ANSI_MAGENTA = '\x1b[35m';
const ANSI_CYAN = '\x1b[36m';
const ANSI_GRAY = '\x1b[90m';

/**
 * Palette: token type → ANSI color sequence.
 *
 * Lookup is by the exact token type string, so every dotted sub-type that
 * should be colored has its own entry. Token types without an entry are
 * rendered without any color.
 */
const TOKEN_COLORS: Record<string, string> = {
    // JS keywords and regex flags (i, g, m, etc.)
    keyword: ANSI_CYAN,
    'keyword.other': ANSI_CYAN,

    // Strings: green body, yellow escapes, red for invalid escapes / unterminated strings
    string: ANSI_GREEN,
    'string.escape': ANSI_YELLOW,
    'string.escape.invalid': ANSI_RED,
    'string.invalid': ANSI_RED,

    // Numeric literals in every notation
    number: ANSI_YELLOW,
    'number.float': ANSI_YELLOW,
    'number.hex': ANSI_YELLOW,
    'number.octal': ANSI_YELLOW,
    'number.binary': ANSI_YELLOW,

    // Comments are subdued
    comment: ANSI_GRAY,
    'comment.doc': ANSI_GRAY,

    // Regex literals are red so they stay distinct from strings
    regexp: ANSI_RED,
    'regexp.escape': ANSI_RED,
    'regexp.escape.control': ANSI_RED,
    'regexp.invalid': ANSI_RED,

    // DocumentDB-specific categories
    'bson.constructor': ANSI_CYAN, // built-in constructors
    'documentdb.operator': ANSI_YELLOW, // stand out in query objects
    'shell.command': ANSI_MAGENTA, // visually distinct from JS keywords
};
// ─── Public API ──────────────────────────────────────────────────────────────

/**
 * Convert token spans and the original input into an ANSI-colorized string.
 *
 * Tokens are processed in array order. Text not covered by any token (gaps
 * between tokens and anything after the last token) is copied through
 * uncolored. Each colored token is wrapped as `<color><text><RESET>`.
 *
 * Malformed spans are tolerated instead of corrupting the line:
 * - a token that starts before the current write position is clamped, so text
 *   that was already emitted is never written a second time;
 * - a token that ends past the input is clamped to the input length;
 * - a token that is empty after clamping is skipped.
 *
 * @param input - The original input string.
 * @param tokens - Token spans for `input`, expected in ascending order.
 * @returns The input string with ANSI color codes inserted. With ANSI codes
 *          stripped, the result equals `input`.
 */
export function colorizeInput(input: string, tokens: TokenSpan[]): string {
    if (input.length === 0 || tokens.length === 0) {
        return input;
    }

    let result = '';
    let pos = 0;

    for (const token of tokens) {
        // Clamp to the part of the input that has not been emitted yet.
        const start = Math.max(token.start, pos);
        const end = Math.min(token.end, input.length);
        if (end <= start) {
            continue;
        }

        // Copy any uncovered gap before this token as plain text.
        if (start > pos) {
            result += input.slice(pos, start);
        }

        const text = input.slice(start, end);
        const color = getTokenColor(token.type);
        result += color !== undefined ? color + text + RESET : text;

        pos = end;
    }

    // Emit any trailing text after the last token.
    if (pos < input.length) {
        result += input.slice(pos);
    }

    return result;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Look up the ANSI color code for a token type.
 *
 * Only the palette's own entries are consulted. Members inherited from
 * `Object.prototype` (for example a token type named `constructor` or
 * `toString`) are deliberately not treated as colors.
 *
 * @param tokenType - Exact token type string.
 * @returns The ANSI sequence, or `undefined` if the token type should not be colorized.
 */
function getTokenColor(tokenType: string): string | undefined {
    return Object.prototype.hasOwnProperty.call(TOKEN_COLORS, tokenType) ? TOKEN_COLORS[tokenType] : undefined;
}
/**
 * Create a ShellInputHandler wired up with the real highlighting pipeline.
 *
 * Everything the handler writes is accumulated in a string, submitted lines
 * are collected in an array, and continuation callbacks are counted, so tests
 * can assert on all three through the returned accessors.
 */
function createHighlightedHandler(): {
    handler: ShellInputHandler;
    getWritten: () => string;
    getLastRender: () => string;
    getLines: () => string[];
    getContinuations: () => number;
} {
    let written = '';
    const lines: string[] = [];
    let continuations = 0;

    const callbacks: ShellInputHandlerCallbacks = {
        write: (data: string) => {
            written += data;
        },
        onLine: (line: string) => {
            lines.push(line);
        },
        onInterrupt: () => {
            // no-op for tests
        },
        onContinuation: () => {
            continuations++;
        },
        colorize: colorizeShellInput,
    };

    const handler = new ShellInputHandler(callbacks);
    handler.setPromptWidth(5); // 5-column prompt, e.g. "db1> "

    return {
        handler,
        getWritten: () => written,
        getLastRender: () => {
            // Extract the last re-render (everything from the last \r onwards)
            const lastCR = written.lastIndexOf('\r');
            return lastCR >= 0 ? written.slice(lastCR) : written;
        },
        getLines: () => lines,
        getContinuations: () => continuations,
    };
}

describe('Shell Syntax Highlighting (integrated)', () => {
    describe('typing a keyword', () => {
        it('should highlight "const" as cyan after typing all characters', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('const');

            const render = getLastRender();
            expect(render).toContain(`${CYAN}const${RESET}`);
        });

        it('should not highlight partial keyword "con"', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('con');

            const render = getLastRender();
            // "con" is not a keyword — should appear without cyan
            expect(render).not.toContain(`${CYAN}con${RESET}`);
            expect(render).toContain('con');
        });
    });

    describe('typing a string', () => {
        it('should highlight a complete double-quoted string in green', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('"hello"');

            const render = getLastRender();
            expect(render).toContain(GREEN);
            expect(render).toContain('hello');
        });
    });

    describe('typing a BSON constructor', () => {
        it('should highlight ObjectId in cyan', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('ObjectId');

            const render = getLastRender();
            expect(render).toContain(`${CYAN}ObjectId${RESET}`);
        });
    });

    describe('typing a DocumentDB operator', () => {
        it('should highlight $gt in yellow', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('$gt');

            const render = getLastRender();
            expect(render).toContain(`${YELLOW}$gt${RESET}`);
        });
    });

    describe('typing a shell command', () => {
        it('should highlight "show" in magenta', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('show');

            const render = getLastRender();
            expect(render).toContain(`${MAGENTA}show${RESET}`);
        });
    });

    describe('typing a number', () => {
        it('should highlight numbers in yellow', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('42');

            const render = getLastRender();
            expect(render).toContain(`${YELLOW}42${RESET}`);
        });
    });

    describe('backspace mid-word', () => {
        it('should update highlighting after backspace changes a keyword to non-keyword', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            // Type "const"
            handler.handleInput('const');
            let render = getLastRender();
            expect(render).toContain(`${CYAN}const${RESET}`);

            // Backspace twice → "con"
            handler.handleInput('\x7f\x7f');
            render = getLastRender();
            // "con" is not a keyword — no cyan
            expect(render).not.toContain(`${CYAN}con${RESET}`);

            // Type "le" → "conle"
            handler.handleInput('le');
            render = getLastRender();
            // "conle" is not a keyword — no cyan
            expect(render).not.toContain(CYAN);
        });
    });

    describe('history recall', () => {
        it('should highlight recalled lines', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            // Type and submit "const x = 1"
            handler.handleInput('const x = 1\r');
            handler.resetLine();

            // Recall with Up arrow
            handler.handleInput('\x1b[A');

            const render = getLastRender();
            // The recalled line should be highlighted
            expect(render).toContain(`${CYAN}const${RESET}`);
            expect(render).toContain(`${YELLOW}1${RESET}`);
        });
    });

    describe('clear line (Ctrl+U)', () => {
        it('should produce empty output after Ctrl+U', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('db.find()');
            handler.handleInput('\x15'); // Ctrl+U

            expect(handler.getBuffer()).toBe('');
            // After clearing, the render should not contain any colored content
            const render = getLastRender();
            expect(render).not.toContain(CYAN);
            expect(render).not.toContain(GREEN);
            expect(render).not.toContain(YELLOW);
            expect(render).not.toContain(MAGENTA);
        });
    });

    describe('mixed expression', () => {
        it('should correctly highlight db.users.find({ $gt: 1 })', () => {
            const { handler, getLastRender } = createHighlightedHandler();

            handler.handleInput('db.users.find({ $gt: 1 })');

            const render = getLastRender();
            // $gt should be yellow (DocumentDB operator)
            expect(render).toContain(`${YELLOW}$gt${RESET}`);
            // 1 should be yellow (number)
            expect(render).toContain(`${YELLOW}1${RESET}`);
            // db, users, find are identifiers — no special color
        });
    });

    describe('colorize disabled', () => {
        it('should not add ANSI codes when colorize is not provided', () => {
            let written = '';
            const callbacks: ShellInputHandlerCallbacks = {
                write: (data: string) => {
                    written += data;
                },
                onLine: () => {},
                onInterrupt: () => {},
                onContinuation: () => {},
                // No colorize callback
            };

            const handler = new ShellInputHandler(callbacks);
            handler.setPromptWidth(5);
            handler.handleInput('const x = 1');

            // Should not contain any ANSI color codes
            expect(written).not.toContain('\x1b[36m'); // Cyan
            expect(written).not.toContain('\x1b[33m'); // Yellow
            expect(written).toContain('const x = 1');
        });

        it('should not add ANSI codes when colorize returns input unchanged (setting disabled)', () => {
            let written = '';
            const callbacks: ShellInputHandlerCallbacks = {
                write: (data: string) => {
                    written += data;
                },
                onLine: () => {},
                onInterrupt: () => {},
                onContinuation: () => {},
                // Simulates the PTY callback when documentDB.shell.display.colorOutput = false
                colorize: (input: string) => input,
            };

            const handler = new ShellInputHandler(callbacks);
            handler.setPromptWidth(5);
            handler.handleInput('const x = 1');

            expect(written).not.toContain('\x1b[36m'); // Cyan
            expect(written).not.toContain('\x1b[33m'); // Yellow
            expect(written).not.toContain('\x1b[32m'); // Green
            expect(written).not.toContain('\x1b[35m'); // Magenta
            expect(written).not.toContain('\x1b[90m'); // Gray
            expect(written).not.toContain('\x1b[31m'); // Red
            expect(written).toContain('const x = 1');
        });

        it('should not colorize any token types when setting is disabled', () => {
            let written = '';
            const callbacks: ShellInputHandlerCallbacks = {
                write: (data: string) => {
                    written += data;
                },
                onLine: () => {},
                onInterrupt: () => {},
                onContinuation: () => {},
                colorize: (input: string) => input,
            };

            const handler = new ShellInputHandler(callbacks);
            handler.setPromptWidth(5);

            // Inputs that together cover every color category.
            const inputs = [
                'show dbs', // shell command (magenta)
                'ObjectId("abc")', // BSON (cyan) + string (green)
                '{ $gt: 42 }', // operator (yellow) + number (yellow)
                '// comment', // comment (gray)
            ];

            // Capture the output of EVERY input, including the first one, so
            // that no category escapes the assertion below.
            const renders: string[] = [];
            inputs.forEach((text, index) => {
                written = '';
                if (index > 0) {
                    handler.handleInput('\x15'); // Ctrl+U to clear the previous input
                }
                handler.handleInput(text);
                renders.push(written);
            });

            // None of the renders should contain any ANSI color escape
            expect(renders).toHaveLength(inputs.length);
            for (const render of renders) {
                expect(render).not.toContain('\x1b[3'); // No 30-series colors
                expect(render).not.toContain('\x1b[9'); // No 90-series colors (gray)
            }
        });
    });
});
/** Matches strings made only of printable ASCII (U+0020–U+007E), including the empty string. */
const PRINTABLE_ASCII_ONLY = /^[\x20-\x7e]*$/;

/**
 * Inclusive code point ranges that are counted as 2 terminal columns.
 * Covers CJK Unified Ideographs and common full-width ranges.
 */
const WIDE_CODE_POINT_RANGES: ReadonlyArray<readonly [number, number]> = [
    [0x1100, 0x115f], // Hangul Jamo
    [0x2e80, 0x303e], // CJK Radicals, Kangxi, CJK Symbols
    [0x3040, 0x33bf], // Hiragana, Katakana, Bopomofo, etc.
    [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
    [0x4e00, 0xa4cf], // CJK Unified Ideographs + Yi
    [0xa960, 0xa97c], // Hangul Jamo Extended-A
    [0xac00, 0xd7a3], // Hangul Syllables
    [0xf900, 0xfaff], // CJK Compatibility Ideographs
    [0xfe30, 0xfe6f], // CJK Compatibility Forms
    [0xff01, 0xff60], // Fullwidth Forms
    [0xffe0, 0xffe6], // Fullwidth Signs
    [0x20000, 0x2fffd], // CJK Unified Ideographs Extension B+
    [0x30000, 0x3fffd], // CJK Unified Ideographs Extension G+
];

/** Create the grapheme-cluster segmenter used for non-ASCII input. */
function createGraphemeSegmenter() {
    return new Intl.Segmenter(undefined, { granularity: 'grapheme' });
}

/**
 * Segmenter shared by all calls. Created on first non-ASCII input, so
 * ASCII-only sessions never construct one.
 */
let graphemeSegmenter: ReturnType<typeof createGraphemeSegmenter> | undefined;

/**
 * Compute the display-column width of a string for the terminal.
 *
 * JavaScript's `String.length` counts UTF-16 code units, but ANSI cursor
 * movement operates on display columns, so the two differ for non-ASCII text.
 *
 * Strings consisting only of printable ASCII take a fast path and return
 * `text.length`. Otherwise the text is split into grapheme clusters with
 * `Intl.Segmenter`. A cluster counts as 2 columns when its first code point
 * is in one of the wide ranges above, and as 1 column otherwise. Emoji and
 * other symbols outside those ranges therefore count as 1 column.
 *
 * @param text - Text as it will be written to the terminal (without ANSI codes).
 * @returns The number of terminal columns the text is expected to occupy.
 */
export function terminalDisplayWidth(text: string): number {
    // Fast path: ASCII-only strings (common case)
    if (PRINTABLE_ASCII_ONLY.test(text)) {
        return text.length;
    }

    if (graphemeSegmenter === undefined) {
        graphemeSegmenter = createGraphemeSegmenter();
    }

    let width = 0;
    // Grapheme segmentation keeps surrogate pairs, combining marks and ZWJ
    // sequences together as one cluster.
    for (const { segment } of graphemeSegmenter.segment(text)) {
        const cp = segment.codePointAt(0) ?? 0;
        width += isWideCharacter(cp) ? 2 : 1;
    }

    return width;
}

/**
 * Returns true for code points that occupy 2 terminal columns.
 *
 * @param cp - Unicode code point to classify.
 * @returns Whether `cp` falls inside one of the wide ranges.
 */
function isWideCharacter(cp: number): boolean {
    return WIDE_CODE_POINT_RANGES.some(([first, last]) => cp >= first && cp <= last);
}