diff --git a/.changeset/smart-chunking-rewrite.md b/.changeset/smart-chunking-rewrite.md new file mode 100644 index 0000000..439c292 --- /dev/null +++ b/.changeset/smart-chunking-rewrite.md @@ -0,0 +1,7 @@ +--- +"@chkit/plugin-backfill": patch +"@chkit/clickhouse": patch +"chkit": patch +--- + +Rewrite backfill chunk planning with multi-strategy smart chunking. The planner now introspects partition layout, sort key distribution, and row estimates to produce better-sized chunks using strategies like equal-width splitting, quantile ranges, temporal bucketing, string prefix splitting, and group-by-key splitting. Adds a dedicated `sdk` entry point for programmatic access to chunking internals. diff --git a/.changeset/structured-backfill-logging.md b/.changeset/structured-backfill-logging.md new file mode 100644 index 0000000..1c795e2 --- /dev/null +++ b/.changeset/structured-backfill-logging.md @@ -0,0 +1,6 @@ +--- +"chkit": patch +"@chkit/plugin-backfill": patch +--- + +Add structured logging to backfill chunk planning via `@logtape/logtape`. The smart chunking planner now logs introspection, partition planning, and per-strategy split decisions, and emits warnings when ClickHouse queries exceed 5s. Enable with `CHKIT_DEBUG=1`. 
diff --git a/bun.lock b/bun.lock index 1191f8c..2f62810 100644 --- a/bun.lock +++ b/bun.lock @@ -23,7 +23,7 @@ }, "apps/docs": { "name": "@chkit/docs", - "version": "0.0.2-beta.8", + "version": "0.0.2-beta.9", "dependencies": { "@astrojs/starlight": "^0.37.6", "astro": "^5.6.1", @@ -33,7 +33,7 @@ }, "packages/cli": { "name": "chkit", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "bin": { "chkit": "./dist/bin/chkit.js", }, @@ -42,28 +42,30 @@ "@chkit/codegen": "workspace:*", "@chkit/core": "workspace:*", "@clickhouse/client": "^1.11.0", + "@logtape/logtape": "^2.0.5", "fast-glob": "^3.3.2", }, }, "packages/clickhouse": { "name": "@chkit/clickhouse", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "@chkit/core": "workspace:*", "@clickhouse/client": "^1.11.0", + "@logtape/logtape": "^2.0.5", "p-retry": "^7.1.1", }, }, "packages/codegen": { "name": "@chkit/codegen", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "@chkit/core": "workspace:*", }, }, "packages/core": { "name": "@chkit/core", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "fast-glob": "^3.3.2", }, @@ -73,17 +75,18 @@ }, "packages/plugin-backfill": { "name": "@chkit/plugin-backfill", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", + "@logtape/logtape": "^2.0.5", "p-map": "^7.0.4", "zod": "^4.3.6", }, }, "packages/plugin-codegen": { "name": "@chkit/plugin-codegen", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "@chkit/core": "workspace:*", "zod": "^4.3.6", @@ -91,8 +94,9 @@ }, "packages/plugin-obsessiondb": { "name": "@chkit/plugin-obsessiondb", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { + "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", "@orpc/client": "1.13.4", "@orpc/contract": "1.13.4", @@ -101,7 +105,7 @@ }, 
"packages/plugin-pull": { "name": "@chkit/plugin-pull", - "version": "0.1.0-beta.19", + "version": "0.1.0-beta.20", "dependencies": { "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", @@ -348,6 +352,8 @@ "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.9", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.0.3", "@jridgewell/sourcemap-codec": "^1.4.10" } }, "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ=="], + "@logtape/logtape": ["@logtape/logtape@2.0.5", "", {}, "sha512-UizDkh20ZPJVOddRxG1F77WhHdlNl/sbQgoO8T534R7XvUBMAJ9En9f35u+meW2tRsNLvjz6R87Zanwf53tspQ=="], + "@manypkg/find-root": ["@manypkg/find-root@1.1.0", "", { "dependencies": { "@babel/runtime": "^7.5.5", "@types/node": "^12.7.1", "find-up": "^4.1.0", "fs-extra": "^8.1.0" } }, "sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA=="], "@manypkg/get-packages": ["@manypkg/get-packages@1.1.3", "", { "dependencies": { "@babel/runtime": "^7.5.5", "@changesets/types": "^4.0.1", "@manypkg/find-root": "^1.1.0", "fs-extra": "^8.1.0", "globby": "^11.0.0", "read-yaml-file": "^1.1.0" } }, "sha512-fo+QhuU3qE/2TQMQmbVMqaQ6EWbMhi4ABWP+O4AM1NqPBuy0OrApV5LO6BrrgnhtAHS2NH6RrVk9OL181tTi8A=="], diff --git a/packages/cli/package.json b/packages/cli/package.json index 0b969b7..0d145c2 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -43,6 +43,7 @@ "@chkit/clickhouse": "workspace:*", "@chkit/codegen": "workspace:*", "@chkit/core": "workspace:*", + "@logtape/logtape": "^2.0.5", "fast-glob": "^3.3.2" } } diff --git a/packages/cli/src/bin/chkit.ts b/packages/cli/src/bin/chkit.ts index 49e300c..914c506 100644 --- a/packages/cli/src/bin/chkit.ts +++ b/packages/cli/src/bin/chkit.ts @@ -17,6 +17,7 @@ import { loadPluginRuntime } from './plugin-runtime.js' import { getInternalPlugins } from './internal-plugins/index.js' import { CLI_VERSION } from './version.js' import { debug } from 
'./debug.js' +import { configureCliLogging } from './logging.js' const WELL_KNOWN_PLUGIN_COMMANDS: Record = { codegen: 'Codegen', @@ -73,6 +74,8 @@ function collectPluginCommands(runtime: Awaited { + configureCliLogging() + const argv = process.argv.slice(2) const commandName = argv[0] debug('cli', `chkit ${CLI_VERSION} — argv: [${argv.join(', ')}]`) diff --git a/packages/cli/src/bin/debug.ts b/packages/cli/src/bin/debug.ts index 5893a18..7f8d3d3 100644 --- a/packages/cli/src/bin/debug.ts +++ b/packages/cli/src/bin/debug.ts @@ -1,22 +1,15 @@ -import process from 'node:process' +import { getLogger } from '@logtape/logtape' -const enabled = process.env.CHKIT_DEBUG === '1' || process.env.CHKIT_DEBUG === 'true' - -function timestamp(): string { - const now = new Date() - return now.toISOString().slice(11, 23) // HH:mm:ss.SSS -} +import { isDebugEnabled } from './logging.js' export function debug(category: string, message: string, detail?: unknown): void { - if (!enabled) return - const prefix = `[chkit:${category}]` + if (!isDebugEnabled()) return + const logger = getLogger(['chkit', category]) if (detail !== undefined) { - console.error(`${timestamp()} ${prefix} ${message}`, detail) - } else { - console.error(`${timestamp()} ${prefix} ${message}`) + logger.debug(message, { detail }) + return } + logger.debug(message) } -export function isDebugEnabled(): boolean { - return enabled -} +export { isDebugEnabled } from './logging.js' diff --git a/packages/cli/src/bin/logging.ts b/packages/cli/src/bin/logging.ts new file mode 100644 index 0000000..e782027 --- /dev/null +++ b/packages/cli/src/bin/logging.ts @@ -0,0 +1,34 @@ +import process from 'node:process' + +import { configureSync, getConfig, getConsoleSink, getTextFormatter } from '@logtape/logtape' + +const enabled = process.env.CHKIT_DEBUG === '1' || process.env.CHKIT_DEBUG === 'true' + +export function configureCliLogging(): void { + if (!enabled || getConfig()) return + + configureSync({ + sinks: { + console: 
getConsoleSink({ + formatter: getTextFormatter({ timestamp: 'time' }), + }), + }, + loggers: [ + { + category: 'chkit', + sinks: ['console'], + lowestLevel: 'debug', + }, + { + category: 'logtape', + sinks: ['console'], + lowestLevel: 'error', + }, + ], + reset: true, + }) +} + +export function isDebugEnabled(): boolean { + return enabled +} diff --git a/packages/cli/src/plugin.test.ts b/packages/cli/src/plugin.test.ts index ddda0ed..8ff0a2b 100644 --- a/packages/cli/src/plugin.test.ts +++ b/packages/cli/src/plugin.test.ts @@ -40,7 +40,7 @@ async function waitForParts( database: string, table: string, expectedPartitions: number, - timeoutMs = 15_000, + timeoutMs = 60_000, ): Promise { const start = Date.now() while (Date.now() - start < timeoutMs) { diff --git a/packages/clickhouse/package.json b/packages/clickhouse/package.json index e09e9a4..41f51e4 100644 --- a/packages/clickhouse/package.json +++ b/packages/clickhouse/package.json @@ -46,6 +46,7 @@ "dependencies": { "@chkit/core": "workspace:*", "@clickhouse/client": "^1.11.0", + "@logtape/logtape": "^2.0.5", "p-retry": "^7.1.1" } } diff --git a/packages/clickhouse/src/index.ts b/packages/clickhouse/src/index.ts index a62dea6..0daccea 100644 --- a/packages/clickhouse/src/index.ts +++ b/packages/clickhouse/src/index.ts @@ -1,4 +1,4 @@ -import { createClient } from '@clickhouse/client' +import { createClient, type ClickHouseSettings } from '@clickhouse/client' import { normalizeSQLFragment, type ChxConfig, @@ -6,6 +6,7 @@ import { type ProjectionDefinition, type SkipIndexDefinition, } from '@chkit/core' +import { getLogger } from '@logtape/logtape' import { parseEngineFromCreateTableQuery, parseOrderByFromCreateTableQuery, @@ -28,9 +29,11 @@ export interface QueryStatus { error?: string } +export type { ClickHouseSettings } + export interface ClickHouseExecutor { command(sql: string): Promise - query(sql: string): Promise + query(sql: string, settings?: ClickHouseSettings): Promise insert>(params: { table: 
string; values: T[] }): Promise listSchemaObjects(): Promise listTableDetails(databases: string[]): Promise @@ -249,7 +252,54 @@ export { waitForTableAbsent, } from './ddl-propagation.js' +function parseSummaryFromHeaders(headers: Record): { + read_rows: string + read_bytes: string + written_rows: string + written_bytes: string + result_rows: string + result_bytes: string + elapsed_ns: string +} | undefined { + const raw = headers['x-clickhouse-summary'] + if (!raw || typeof raw !== 'string') return undefined + try { + return JSON.parse(raw) + } catch { + return undefined + } +} + +function logProfiling( + logger: ReturnType, + query: string, + queryId: string, + summary?: { + read_rows: string + read_bytes: string + written_rows: string + written_bytes: string + result_rows?: string + result_bytes?: string + elapsed_ns: string + }, +): void { + logger.trace('Query completed: {query}', { + query, + queryId, + readRows: Number(summary?.read_rows ?? 0), + readBytes: Number(summary?.read_bytes ?? 0), + writtenRows: Number(summary?.written_rows ?? 0), + writtenBytes: Number(summary?.written_bytes ?? 0), + elapsedMs: Number(summary?.elapsed_ns ?? 0) / 1_000_000, + resultRows: Number(summary?.result_rows ?? 0), + resultBytes: Number(summary?.result_bytes ?? 0), + }) +} + export function createClickHouseExecutor(config: NonNullable): ClickHouseExecutor { + const profiler = getLogger(['chkit', 'profiling']) + const client = createClient({ url: config.url, username: config.username, @@ -259,6 +309,7 @@ export function createClickHouseExecutor(config: NonNullable { try { - await client.command({ query: sql, http_headers: { 'X-DDL': '1' } }) + const result = await client.command({ query: sql, http_headers: { 'X-DDL': '1' } }) + logProfiling(profiler, sql, result.query_id, result.summary) } catch (error) { if (isUnknownDatabaseError(error)) { - // The configured database doesn't exist yet. Retry without the - // session database so that CREATE DATABASE can succeed. 
const fallback = createClient({ url: config.url, username: config.username, @@ -296,21 +346,24 @@ export function createClickHouseExecutor(config: NonNullable(sql: string): Promise { + async query(sql: string, settings?: ClickHouseSettings): Promise { try { - const result = await client.query({ query: sql, format: 'JSONEachRow', http_headers: { 'X-DDL': '1' } }) - return result.json() + const result = await client.query({ query: sql, format: 'JSONEachRow', http_headers: { 'X-DDL': '1' }, ...(settings ? { clickhouse_settings: settings } : {}) }) + const rows = await result.json() + logProfiling(profiler, sql, result.query_id, parseSummaryFromHeaders(result.response_headers)) + return rows } catch (error) { wrapConnectionError(error, config.url) } }, async insert>(params: { table: string; values: T[] }): Promise { try { - await client.insert({ + const result = await client.insert({ table: params.table, values: params.values, format: 'JSONEachRow', }) + logProfiling(profiler, `INSERT INTO ${params.table}`, result.query_id, result.summary) } catch (error) { wrapConnectionError(error, config.url) } @@ -327,7 +380,7 @@ export function createClickHouseExecutor(config: NonNullable { try { const running = await client.query({ - query: `SELECT read_rows, read_bytes, written_rows, written_bytes, elapsed FROM clusterAllReplicas('parallel_replicas', system.processes) WHERE user = currentUser() AND query_id = {qid:String} SETTINGS skip_unavailable_shards = 1`, + query: `SELECT read_rows, read_bytes, written_rows, written_bytes, elapsed FROM clusterAllReplicas('cluster', system.processes) WHERE user = currentUser() AND query_id = {qid:String} SETTINGS skip_unavailable_shards = 1`, query_params: { qid: queryId }, format: 'JSONEachRow', }) @@ -353,7 +406,7 @@ export function createClickHouseExecutor(config: NonNullable { + const result = await client.query({ + query: sql, + format: 'JSONEachRow', + clickhouse_settings: settings as Record, + }) + return result.json() +} + +const 
plan = await generateChunkPlan({ + database: 'analytics', + table: 'events', + from: '2025-01-01T00:00:00Z', + to: '2025-02-01T00:00:00Z', + targetChunkBytes: 1_000_000_000, // ~1 GiB per chunk + query, + // 'count' is exact but slower; 'explain-estimate' is faster but approximate + rowProbeStrategy: 'count', +}) + +console.log(`${plan.chunks.length} chunks, ${plan.totalRows.toLocaleString()} rows`) +``` + +### Execute chunks against a target + +`buildChunkExecutionSql` produces the per-chunk `INSERT … SELECT` and `executeBackfill` runs them with concurrency, polling, and progress callbacks. Persist the `progress` argument anywhere you like to support resume. + +```ts +import { createClickHouseExecutor } from '@chkit/clickhouse' +import { + buildChunkExecutionSql, + executeBackfill, + type BackfillProgress, +} from '@chkit/plugin-backfill/sdk' + +const executor = createClickHouseExecutor({ + url: process.env.CLICKHOUSE_URL!, + username: 'default', + password: process.env.CLICKHOUSE_PASSWORD!, + database: 'analytics', +}) + +const chunksById = new Map(plan.chunks.map((chunk) => [chunk.id, chunk])) +let saved: BackfillProgress | undefined // load from disk for resume + +const result = await executeBackfill({ + executor, + planId: plan.planId, + chunks: plan.chunks, + buildQuery: ({ id }) => + buildChunkExecutionSql({ + planId: plan.planId, + chunk: chunksById.get(id)!, + target: 'analytics.events_backfill', + table: plan.table, + }), + concurrency: 4, + pollIntervalMs: 5_000, + resumeFrom: saved, + onProgress: async (progress) => { + saved = progress + // persist to disk / state store + }, +}) + +console.log(`done=${result.completed} failed=${result.failed}`) +``` + +### Plan persistence + +Plans contain string boundaries that may include non-UTF-8 bytes (the planner uses `latin1`-encoded byte ranges for string sort keys), so JSON-serializing a `ChunkPlan` directly will lose information. 
Use the codec helpers when you need to round-trip a plan through storage: + +```ts +import { + encodeChunkPlanForPersistence, + decodeChunkPlanFromPersistence, +} from '@chkit/plugin-backfill/sdk' + +const json = JSON.stringify(encodeChunkPlanForPersistence(plan)) +// later … +const plan2 = decodeChunkPlanFromPersistence(JSON.parse(json)) +``` + +### Logging + +The planner emits structured logs via [`@logtape/logtape`](https://logtape.org/) under the `['chkit', 'backfill']` category. Configure a sink at process start to see them — slow-query warnings (>5 s) are emitted at `warning` level, planning progress at `info`, and per-strategy decisions at `debug`. + +```ts +import { configureSync, getConsoleSink, getTextFormatter } from '@chkit/plugin-backfill/sdk' + +configureSync({ + sinks: { console: getConsoleSink({ formatter: getTextFormatter({ timestamp: 'time' }) }) }, + loggers: [{ category: 'chkit', sinks: ['console'], lowestLevel: 'info' }], + reset: true, +}) +``` + +To capture every SQL statement the planner runs (with timing, server-side stats, and per-strategy classification), wrap your `query` function instead of relying solely on logging — the wrapper sees the raw SQL and settings on every call and can record query IDs, response headers, and durations alongside the structured logs. 
+ ## License [MIT](../../LICENSE) diff --git a/packages/plugin-backfill/package.json b/packages/plugin-backfill/package.json index c4cbe63..87326bf 100644 --- a/packages/plugin-backfill/package.json +++ b/packages/plugin-backfill/package.json @@ -27,6 +27,11 @@ "source": "./src/index.ts", "types": "./dist/index.d.ts", "default": "./dist/index.js" + }, + "./sdk": { + "source": "./src/sdk.ts", + "types": "./dist/sdk.d.ts", + "default": "./dist/sdk.js" } }, "files": [ @@ -38,11 +43,14 @@ "typecheck": "tsc -p tsconfig.json --noEmit", "lint": "biome lint src", "test": "bun test src", + "test:env": "doppler run --project chkit --config ci -- bun test src", + "seed:env": "doppler run --project chkit --config ci -- bun run src/chunking/e2e/seed-datasets.script.ts", "clean": "rm -rf dist" }, "dependencies": { "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", + "@logtape/logtape": "^2.0.5", "p-map": "^7.0.4", "zod": "^4.3.6" } diff --git a/packages/plugin-backfill/src/async-backfill.ts b/packages/plugin-backfill/src/async-backfill.ts index f393499..bbb7c3b 100644 --- a/packages/plugin-backfill/src/async-backfill.ts +++ b/packages/plugin-backfill/src/async-backfill.ts @@ -7,9 +7,9 @@ export interface BackfillOptions { /** Plan ID used as a namespace in deterministic query IDs */ planId: string /** The chunks to process (from buildChunks) */ - chunks: Array<{ id: string; from: string; to: string; [key: string]: unknown }> + chunks: Array<{ id: string; from?: string; to?: string; [key: string]: unknown }> /** Build the SQL for a given chunk. Called once per chunk at submit time. */ - buildQuery: (chunk: { id: string; from: string; to: string }) => string + buildQuery: (chunk: { id: string; from?: string; to?: string }) => string /** Max concurrent queries running on the server. Default: 3 */ concurrency?: number /** Polling interval in ms. 
Default: 5000 */ @@ -149,7 +149,7 @@ export async function syncProgress( const safePrefix = prefix.replace(/'/g, "''").replace(/%/g, '\\%').replace(/_/g, '\\_') const runningRows = await executor.query<{ query_id: string }>( - `SELECT query_id FROM clusterAllReplicas('parallel_replicas', system.processes) WHERE user = currentUser() AND query_id LIKE '${safePrefix}%' SETTINGS skip_unavailable_shards = 1` + `SELECT query_id FROM clusterAllReplicas('cluster', system.processes) WHERE user = currentUser() AND query_id LIKE '${safePrefix}%' SETTINGS skip_unavailable_shards = 1` ) const runningSet = new Set(runningRows.map((r) => r.query_id)) @@ -162,7 +162,7 @@ export async function syncProgress( exception: string }>( `SELECT query_id, type, written_rows, written_bytes, query_duration_ms, exception -FROM clusterAllReplicas('parallel_replicas', system.query_log) +FROM clusterAllReplicas('cluster', system.query_log) WHERE user = currentUser() AND query_id LIKE '${safePrefix}%' AND type IN ('QueryFinish', 'ExceptionWhileProcessing') diff --git a/packages/plugin-backfill/src/chunking/analyze.ts b/packages/plugin-backfill/src/chunking/analyze.ts index 7e051e1..3903b72 100644 --- a/packages/plugin-backfill/src/chunking/analyze.ts +++ b/packages/plugin-backfill/src/chunking/analyze.ts @@ -1,129 +1,15 @@ -import { hashId, randomPlanId } from '../state.js' +import { generateChunkPlan } from './planner.js' +import type { ChunkPlan, GenerateChunkPlanInput } from './types.js' -import { buildChunkBoundaries } from './build.js' -import { introspectTable, querySortKeyRanges } from './introspect.js' -import type { ChunkBoundary, PartitionInfo, PlannedChunk, SortKeyInfo } from './types.js' - -export interface AnalyzeAndChunkInput { - database: string - table: string - from?: string - to?: string - maxChunkBytes: number - requireIdempotencyToken: boolean - query: (sql: string) => Promise -} - -export interface AnalyzeAndChunkResult { - planId: string - partitions: PartitionInfo[] - 
sortKey?: SortKeyInfo - chunks: PlannedChunk[] -} +export type AnalyzeAndChunkInput = GenerateChunkPlanInput +export type AnalyzeAndChunkResult = ChunkPlan +export type AnalyzeTableInput = GenerateChunkPlanInput +export type AnalyzeTableResult = ChunkPlan export async function analyzeAndChunk(input: AnalyzeAndChunkInput): Promise { - const { partitions, sortKey, boundaries } = await analyzeTable({ - database: input.database, - table: input.table, - from: input.from, - to: input.to, - maxChunkBytes: input.maxChunkBytes, - query: input.query, - }) - - const planId = randomPlanId() - - const chunks = buildPlannedChunks({ - planId, - partitions, - boundaries, - requireIdempotencyToken: input.requireIdempotencyToken, - }) - - return { planId, partitions, sortKey, chunks } -} - -export interface AnalyzeTableInput { - database: string - table: string - from?: string - to?: string - maxChunkBytes: number - query: (sql: string) => Promise -} - -export interface AnalyzeTableResult { - partitions: PartitionInfo[] - sortKey?: SortKeyInfo - boundaries: ChunkBoundary[] + return generateChunkPlan(input) } export async function analyzeTable(input: AnalyzeTableInput): Promise { - const { partitions, sortKey } = await introspectTable({ - database: input.database, - table: input.table, - from: input.from, - to: input.to, - query: input.query, - }) - - const oversizedPartitionIds = partitions - .filter(p => p.bytesOnDisk > input.maxChunkBytes) - .map(p => p.partitionId) - - let sortKeyRanges: Map | undefined - if (sortKey && oversizedPartitionIds.length > 0) { - sortKeyRanges = await querySortKeyRanges({ - database: input.database, - table: input.table, - sortKeyColumn: sortKey.column, - partitionIds: oversizedPartitionIds, - query: input.query, - }) - } - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: input.maxChunkBytes, - sortKey, - sortKeyRanges, - }) - - return { partitions, sortKey, boundaries } -} - -export function buildPlannedChunks(input: { - 
planId: string - partitions: PartitionInfo[] - boundaries: ChunkBoundary[] - requireIdempotencyToken: boolean -}): PlannedChunk[] { - const chunks: PlannedChunk[] = [] - const partitionIndex = new Map() - - for (const boundary of input.boundaries) { - const idx = partitionIndex.get(boundary.partitionId) ?? 0 - partitionIndex.set(boundary.partitionId, idx + 1) - - const idSeed = `${input.planId}:${boundary.partitionId}:${idx}` - const chunkId = hashId(`chunk:${idSeed}`).slice(0, 16) - const token = input.requireIdempotencyToken ? hashId(`token:${idSeed}`) : '' - - const partition = input.partitions.find(p => p.partitionId === boundary.partitionId) - const from = boundary.sortKeyFrom ?? partition?.minTime ?? '' - const to = boundary.sortKeyTo ?? partition?.maxTime ?? '' - - chunks.push({ - id: chunkId, - partitionId: boundary.partitionId, - sortKeyFrom: boundary.sortKeyFrom, - sortKeyTo: boundary.sortKeyTo, - estimatedBytes: boundary.estimatedBytes, - idempotencyToken: token, - from, - to, - }) - } - - return chunks + return analyzeAndChunk(input) } diff --git a/packages/plugin-backfill/src/chunking/boundary-codec.ts b/packages/plugin-backfill/src/chunking/boundary-codec.ts new file mode 100644 index 0000000..73b8984 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/boundary-codec.ts @@ -0,0 +1,109 @@ +import type { + Chunk, + ChunkPlan, + ChunkRange, + FocusedValue, + SortKey, +} from './types.js' + +export function encodeBoundary( + value: string | undefined, + sortKey: SortKey | undefined, +): string | undefined { + if (value === undefined || sortKey === undefined) return value + if (sortKey.boundaryEncoding === 'hex-latin1') { + return Buffer.from(value, 'latin1').toString('hex') + } + return value +} + +export function decodeBoundary( + value: string | undefined, + sortKey: SortKey | undefined, +): string | undefined { + if (value === undefined || sortKey === undefined) return value + if (sortKey.boundaryEncoding === 'hex-latin1') { + return 
Buffer.from(value, 'hex').toString('latin1') + } + return value +} + +export function encodeRangesForPlan( + ranges: ChunkRange[], + sortKeys: SortKey[], +): ChunkRange[] { + return ranges.map((range) => ({ + dimensionIndex: range.dimensionIndex, + from: encodeBoundary(range.from, sortKeys[range.dimensionIndex]), + to: encodeBoundary(range.to, sortKeys[range.dimensionIndex]), + })) +} + +export function decodeRangesFromPlan( + ranges: ChunkRange[], + sortKeys: SortKey[], +): ChunkRange[] { + return ranges.map((range) => ({ + dimensionIndex: range.dimensionIndex, + from: decodeBoundary(range.from, sortKeys[range.dimensionIndex]), + to: decodeBoundary(range.to, sortKeys[range.dimensionIndex]), + })) +} + +function encodeFocusedValue( + focusedValue: FocusedValue | undefined, + sortKeys: SortKey[], +): FocusedValue | undefined { + if (!focusedValue) return undefined + return { + dimensionIndex: focusedValue.dimensionIndex, + value: encodeBoundary(focusedValue.value, sortKeys[focusedValue.dimensionIndex]) ?? focusedValue.value, + } +} + +function decodeFocusedValue( + focusedValue: FocusedValue | undefined, + sortKeys: SortKey[], +): FocusedValue | undefined { + if (!focusedValue) return undefined + return { + dimensionIndex: focusedValue.dimensionIndex, + value: decodeBoundary(focusedValue.value, sortKeys[focusedValue.dimensionIndex]) ?? 
focusedValue.value, + } +} + +export function encodeChunkForPlan(chunk: Chunk, sortKeys: SortKey[]): Chunk { + return { + ...chunk, + ranges: encodeRangesForPlan(chunk.ranges, sortKeys), + analysis: { + ...chunk.analysis, + focusedValue: encodeFocusedValue(chunk.analysis.focusedValue, sortKeys), + }, + } +} + +export function decodeChunkFromPlan(chunk: Chunk, sortKeys: SortKey[]): Chunk { + return { + ...chunk, + ranges: decodeRangesFromPlan(chunk.ranges, sortKeys), + analysis: { + ...chunk.analysis, + focusedValue: decodeFocusedValue(chunk.analysis.focusedValue, sortKeys), + }, + } +} + +export function encodeChunkPlanForPersistence(plan: ChunkPlan): ChunkPlan { + return { + ...plan, + chunks: plan.chunks.map((chunk) => encodeChunkForPlan(chunk, plan.table.sortKeys)), + } +} + +export function decodeChunkPlanFromPersistence(plan: ChunkPlan): ChunkPlan { + return { + ...plan, + chunks: plan.chunks.map((chunk) => decodeChunkFromPlan(chunk, plan.table.sortKeys)), + } +} diff --git a/packages/plugin-backfill/src/chunking/build.test.ts b/packages/plugin-backfill/src/chunking/build.test.ts deleted file mode 100644 index 71aa47e..0000000 --- a/packages/plugin-backfill/src/chunking/build.test.ts +++ /dev/null @@ -1,135 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { buildChunkBoundaries } from './build.js' -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const GiB = 1024 ** 3 - -describe('buildChunkBoundaries', () => { - test('small partition produces one chunk boundary', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T23:59:59.000Z' }, - ] - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.partitionId).toBe('202501') - expect(boundaries[0]?.sortKeyFrom).toBeUndefined() - 
expect(boundaries[0]?.sortKeyTo).toBeUndefined() - expect(boundaries[0]?.estimatedBytes).toBe(5 * GiB) - }) - - test('large partition produces multiple sub-chunks with sort key ranges', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202501', { min: '2025-01-01 00:00:00', max: '2025-01-31 00:00:00' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(3) - for (const b of boundaries) { - expect(b.partitionId).toBe('202501') - expect(b.sortKeyFrom).toBeDefined() - expect(b.sortKeyTo).toBeDefined() - } - }) - - test('large partition without sort key produces single chunk', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.estimatedBytes).toBe(30 * GiB) - }) - - test('mixed sizes produce correct boundary counts', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 500, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - { partitionId: '202502', rows: 5000, bytesOnDisk: 25 * GiB, minTime: '2025-02-01T00:00:00.000Z', maxTime: '2025-02-28T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202502', { min: '2025-02-01 00:00:00', max: '2025-02-28 00:00:00' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, 
- maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - // First partition: 5 GiB < 10 GiB -> 1 boundary - // Second partition: 25 GiB / 10 GiB = 3 sub-boundaries - expect(boundaries).toHaveLength(4) - - const p1 = boundaries.filter((b) => b.partitionId === '202501') - const p2 = boundaries.filter((b) => b.partitionId === '202502') - expect(p1).toHaveLength(1) - expect(p2).toHaveLength(3) - }) - - test('large partition with min === max sort key produces single chunk', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_type', type: 'String', category: 'string' } - const sortKeyRanges = new Map([ - ['202501', { min: 'click', max: 'click' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.partitionId).toBe('202501') - expect(boundaries[0]?.sortKeyFrom).toBeUndefined() - expect(boundaries[0]?.sortKeyTo).toBeUndefined() - }) - - test('numeric sort key produces numeric range sub-chunks', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 20 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'id', type: 'UInt64', category: 'numeric' } - const sortKeyRanges = new Map([ - ['202501', { min: '100', max: '200' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(2) - expect(boundaries[0]?.sortKeyFrom).toBe('100') - expect(boundaries[0]?.sortKeyTo).toBe('150') - expect(boundaries[1]?.sortKeyFrom).toBe('150') - expect(boundaries[1]?.sortKeyTo).toBe('201') - }) -}) diff --git 
a/packages/plugin-backfill/src/chunking/build.ts b/packages/plugin-backfill/src/chunking/build.ts deleted file mode 100644 index cc6693b..0000000 --- a/packages/plugin-backfill/src/chunking/build.ts +++ /dev/null @@ -1,60 +0,0 @@ -import { splitSortKeyRange } from './splitter.js' -import type { ChunkBoundary, PartitionInfo, SortKeyInfo } from './types.js' - -export function buildChunkBoundaries(input: { - partitions: PartitionInfo[] - maxChunkBytes: number - sortKey?: SortKeyInfo - sortKeyRanges?: Map -}): ChunkBoundary[] { - const boundaries: ChunkBoundary[] = [] - - for (const partition of input.partitions) { - if (partition.bytesOnDisk <= input.maxChunkBytes) { - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - } else if (input.sortKey && input.sortKeyRanges) { - const range = input.sortKeyRanges.get(partition.partitionId) - if (!range) { - // No range data — emit as single chunk - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - continue - } - - // If min === max, splitting would produce empty sub-ranges; emit as single chunk - if (range.min === range.max) { - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - continue - } - - const subCount = Math.ceil(partition.bytesOnDisk / input.maxChunkBytes) - const subRanges = splitSortKeyRange(input.sortKey.category, range.min, range.max, subCount) - const estimatedBytesPerSub = Math.ceil(partition.bytesOnDisk / subCount) - - for (const sub of subRanges) { - boundaries.push({ - partitionId: partition.partitionId, - sortKeyFrom: sub.from, - sortKeyTo: sub.to, - estimatedBytes: estimatedBytesPerSub, - }) - } - } else { - // No sort key info — emit as single chunk despite being oversized - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - } - } - - return boundaries -} diff --git 
a/packages/plugin-backfill/src/chunking/e2e/constants.ts b/packages/plugin-backfill/src/chunking/e2e/constants.ts new file mode 100644 index 0000000..98e4738 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/e2e/constants.ts @@ -0,0 +1 @@ +export const TABLE_PREFIX = 'chkit_e2e_chunking' diff --git a/packages/plugin-backfill/src/chunking/e2e/seed-datasets.script.ts b/packages/plugin-backfill/src/chunking/e2e/seed-datasets.script.ts new file mode 100644 index 0000000..e665c5d --- /dev/null +++ b/packages/plugin-backfill/src/chunking/e2e/seed-datasets.script.ts @@ -0,0 +1,163 @@ +#!/usr/bin/env bun + +/** + * Seeds ClickHouse tables for smart-chunking E2E tests. + * + * Run once manually: + * bun run packages/plugin-backfill/src/chunking/e2e/seed-datasets.script.ts + * + * Requires CLICKHOUSE_HOST/CLICKHOUSE_URL + CLICKHOUSE_PASSWORD env vars. + * Creates tables if they don't exist, truncates them, and re-inserts data. + */ + +import { randomBytes } from 'node:crypto' +import { getRequiredEnv, createLiveExecutor } from '@chkit/clickhouse/e2e-testkit' + +import { TABLE_PREFIX } from './constants.js' + +interface DatasetConfig { + name: string + columns: string + orderBy: string + partitionBy: string + generate: () => Record<string, unknown>[] +} + +function pad(bytes: number): string { + return randomBytes(bytes).toString('hex') +} + +function dayHour(day: number, hour: number): string { + return `2026-01-${String(day).padStart(2, '0')} ${String(hour).padStart(2, '0')}:00:00` +} + +export const datasets: DatasetConfig[] = [ + { + name: 'skewed_power_law', + columns: [ + 'tenant_id String', + 'seq UInt64', + 'event_time DateTime', + 'padding String', + ].join(', '), + orderBy: '(tenant_id, seq)', + partitionBy: 'toYYYYMM(event_time)', + generate() { + const rows: Record<string, unknown>[] = [] + + // 80%: single dominant tenant — 8,000 rows + for (let i = 0; i < 8000; i++) { + rows.push({ + tenant_id: 'mega-corp', + seq: i, + event_time: dayHour(1 + (i % 28), i % 24), + padding: pad(512), + }) + } + + // 20%:
200 small tenants, 10 rows each — 2,000 rows + for (let t = 0; t < 200; t++) { + for (let i = 0; i < 10; i++) { + rows.push({ + tenant_id: `tenant-${String(t).padStart(4, '0')}`, + seq: i, + event_time: dayHour(1 + ((t * 10 + i) % 28), (t + i) % 24), + padding: pad(512), + }) + } + } + + return rows + }, + }, + { + name: 'multiple_hot_keys', + columns: [ + 'tenant_id String', + 'seq UInt64', + 'event_time DateTime', + 'padding String', + ].join(', '), + orderBy: '(tenant_id, seq)', + partitionBy: 'toYYYYMM(event_time)', + generate() { + const rows: Record[] = [] + + // 3 hot tenants, ~30% each — 3,000 rows each = 9,000 rows + for (const tenant of ['alpha-corp', 'beta-corp', 'gamma-corp']) { + for (let i = 0; i < 3000; i++) { + rows.push({ + tenant_id: tenant, + seq: i, + event_time: dayHour(1 + (i % 28), i % 24), + padding: pad(512), + }) + } + } + + // 10%: 100 small tenants, 10 rows each — 1,000 rows + for (let t = 0; t < 100; t++) { + for (let i = 0; i < 10; i++) { + rows.push({ + tenant_id: `small-${String(t).padStart(4, '0')}`, + seq: i, + event_time: dayHour(1 + ((t * 10 + i) % 28), (t + i) % 24), + padding: pad(512), + }) + } + } + + return rows + }, + }, +] + +const BATCH_SIZE = 5000 + +async function seed() { + const env = getRequiredEnv() + const executor = createLiveExecutor(env) + const db = env.clickhouseDatabase + + try { + for (const dataset of datasets) { + const table = `${TABLE_PREFIX}_${dataset.name}` + const fqn = `${db}.${table}` + console.log(`\n--- Seeding ${fqn} ---`) + + await executor.command(` + CREATE TABLE IF NOT EXISTS ${fqn} ( + ${dataset.columns} + ) ENGINE = MergeTree() + PARTITION BY ${dataset.partitionBy} + ORDER BY ${dataset.orderBy} + `) + console.log(' Table ensured.') + + await executor.command(`TRUNCATE TABLE ${fqn}`) + console.log(' Truncated.') + + const rows = dataset.generate() + for (let i = 0; i < rows.length; i += BATCH_SIZE) { + const batch = rows.slice(i, i + BATCH_SIZE) + await executor.insert({ table: fqn, values: 
batch }) + console.log(` Inserted ${Math.min(i + BATCH_SIZE, rows.length)} / ${rows.length} rows`) + } + + // Verify + const [result] = await executor.query<{ cnt: string }>( + `SELECT count() AS cnt FROM ${fqn} SETTINGS select_sequential_consistency = 1`, + ) + console.log(` Verified: ${result?.cnt} rows`) + } + } finally { + await executor.close() + } + + console.log('\nDone!') +} + +seed().catch((error) => { + console.error(error) + process.exit(1) +}) diff --git a/packages/plugin-backfill/src/chunking/e2e/smart-chunking.e2e.test.ts b/packages/plugin-backfill/src/chunking/e2e/smart-chunking.e2e.test.ts new file mode 100644 index 0000000..56aa47d --- /dev/null +++ b/packages/plugin-backfill/src/chunking/e2e/smart-chunking.e2e.test.ts @@ -0,0 +1,297 @@ +import { afterAll, beforeAll, describe, expect, test } from 'bun:test' + +import { createClient } from '@clickhouse/client' +import { createLiveExecutor, getRequiredEnv } from '@chkit/clickhouse/e2e-testkit' +import type { ClickHouseExecutor } from '@chkit/clickhouse' + +import { analyzeAndChunk } from '../analyze.js' +import { buildChunkExecutionSql, buildWhereClauseFromChunk } from '../sql.js' +import type { Chunk, ChunkPlan, PlannerQuery } from '../types.js' + +import { TABLE_PREFIX } from './constants.js' + +// --------------------------------------------------------------------------- +// Shared setup +// --------------------------------------------------------------------------- + +let executor: ClickHouseExecutor +let plannerQuery: PlannerQuery +let closePlannerClient: () => Promise +let db: string + +beforeAll(() => { + const env = getRequiredEnv() + executor = createLiveExecutor(env) + db = env.clickhouseDatabase + + // The planner runs parallel queries via pMap, which requires a sessionless + // client to avoid ClickHouse Cloud session locking errors. 
+ const client = createClient({ + url: env.clickhouseUrl, + username: env.clickhouseUser, + password: env.clickhousePassword, + database: env.clickhouseDatabase, + clickhouse_settings: { wait_end_of_query: 1 }, + }) + + plannerQuery = async (sql: string, settings?: Record): Promise => { + const result = await client.query({ + query: sql, + format: 'JSONEachRow', + ...(settings ? { clickhouse_settings: settings } : {}), + }) + return result.json() + } + closePlannerClient = () => client.close() +}) + +afterAll(async () => { + await closePlannerClient?.() + await executor?.close() +}) + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function strategyIds(chunk: Chunk): string[] { + return chunk.analysis.lineage.map((step) => step.strategyId) +} + +async function requireSeededTable(table: string): Promise { + const [result] = await executor.query<{ cnt: string }>( + `SELECT count() AS cnt FROM ${db}.${table} SETTINGS select_sequential_consistency = 1`, + ) + const count = Number(result?.cnt ?? 0) + if (count === 0) { + throw new Error( + `Table ${db}.${table} is empty. Run the seed script first:\n` + + ` bun run seed:env`, + ) + } + return count +} + +async function getPartitionUncompressedBytes(table: string): Promise { + const rows = await executor.query<{ total: string }>(` + SELECT toString(sum(data_uncompressed_bytes)) AS total + FROM system.parts + WHERE database = '${db}' AND table = '${table}' AND active = 1 + SETTINGS select_sequential_consistency = 1 + `) + return Number(rows[0]?.total ?? 
0) +} + +async function chunkPlan(table: string, targetChunkBytes: number): Promise { + return analyzeAndChunk({ + database: db, + table, + targetChunkBytes, + query: plannerQuery, + querySettings: { enable_parallel_replicas: 0 }, + }) +} + +function buildSql(plan: ChunkPlan, chunk: Chunk): string { + return buildChunkExecutionSql({ + planId: plan.planId, + chunk, + target: `${plan.table.database}.${plan.table.table}`, + sourceTarget: `${plan.table.database}.${plan.table.table}`, + table: plan.table, + }) +} + +// --------------------------------------------------------------------------- +// Scenario 1: Skewed Power Law Distribution +// +// 80% of rows belong to a single tenant ("mega-corp"), 20% spread across +// 200 small tenants. Sort key: (tenant_id, seq). +// +// Expected behavior: +// - The system detects "mega-corp" as a hot key +// - mega-corp chunks are split on the secondary dimension (seq) +// - Small tenants are grouped into larger chunks +// - All rows are covered, no gaps or overlaps +// --------------------------------------------------------------------------- + +describe('e2e: skewed power law', () => { + const table = `${TABLE_PREFIX}_skewed_power_law` + let plan: ChunkPlan + let totalRows: number + + beforeAll(async () => { + totalRows = await requireSeededTable(table) + const uncompressedBytes = await getPartitionUncompressedBytes(table) + + // Target ~5 chunks + const targetChunkBytes = Math.floor(uncompressedBytes / 5) + plan = await chunkPlan(table, targetChunkBytes) + }, 60_000) + + test('produces multiple chunks', () => { + expect(plan.chunks.length).toBeGreaterThan(1) + }) + + test('detects mega-corp as a focused (hot) key', () => { + const focused = plan.chunks.filter( + (c) => c.analysis.focusedValue?.value === 'mega-corp', + ) + expect(focused.length).toBeGreaterThan(0) + }) + + test('mega-corp chunks are split on the secondary dimension (seq)', () => { + const megaCorpChunks = plan.chunks.filter( + (c) => 
c.analysis.focusedValue?.value === 'mega-corp', + ) + expect(megaCorpChunks.length).toBeGreaterThan(1) + + // Each mega-corp chunk should have ranges on both dimensions + for (const chunk of megaCorpChunks) { + const dims = new Set(chunk.ranges.map((r) => r.dimensionIndex)) + expect(dims.has(0)).toBe(true) // tenant_id + expect(dims.has(1)).toBe(true) // seq + } + }) + + test('mega-corp chunk boundaries on dim 1 are contiguous', () => { + const megaCorpChunks = plan.chunks + .filter((c) => c.analysis.focusedValue?.value === 'mega-corp') + .sort((a, b) => { + const aFrom = a.ranges.find((r) => r.dimensionIndex === 1)?.from ?? '' + const bFrom = b.ranges.find((r) => r.dimensionIndex === 1)?.from ?? '' + return String(aFrom).localeCompare(String(bFrom)) + }) + + for (let i = 1; i < megaCorpChunks.length; i++) { + const prev = megaCorpChunks[i - 1]?.ranges.find((r) => r.dimensionIndex === 1) + const curr = megaCorpChunks[i]?.ranges.find((r) => r.dimensionIndex === 1) + if (prev?.to !== undefined && curr?.from !== undefined) { + expect(prev.to).toBe(curr.from) + } + } + }) + + test('estimated row sum is within 20% of actual count', () => { + const estimatedTotal = plan.chunks.reduce((sum, c) => sum + c.estimate.rows, 0) + const ratio = estimatedTotal / totalRows + expect(ratio).toBeGreaterThanOrEqual(0.8) + expect(ratio).toBeLessThanOrEqual(1.2) + }) + + test('no chunk exceeds 2x the target size', () => { + for (const chunk of plan.chunks) { + expect(chunk.estimate.bytesUncompressed).toBeLessThan(plan.targetChunkBytes * 2) + } + }) + + test('every chunk produces valid execution SQL', () => { + for (const chunk of plan.chunks) { + const sql = buildSql(plan, chunk) + expect(sql).toContain('INSERT INTO') + expect(sql).toContain('_partition_id') + // mega-corp chunks should reference both sort key columns + if (chunk.analysis.focusedValue?.value === 'mega-corp') { + expect(sql).toContain('tenant_id >=') + expect(sql).toContain('seq >=') + } + } + }) + + test('executing all 
chunk queries returns the full row count', async () => { + let totalCounted = 0 + for (const chunk of plan.chunks) { + const where = buildWhereClauseFromChunk(chunk, plan.table) + const countSql = `SELECT count() AS cnt FROM ${db}.${table} WHERE ${where}` + const [row] = await executor.query<{ cnt: string }>(countSql) + totalCounted += Number(row?.cnt ?? 0) + } + + expect(totalCounted).toBe(totalRows) + }, 60_000) +}) + +// --------------------------------------------------------------------------- +// Scenario 2: Multiple Hot Keys +// +// Three tenants each hold ~30% of rows ("alpha-corp", "beta-corp", +// "gamma-corp"), with ~10% spread across 100 small tenants. +// Sort key: (tenant_id, seq). +// +// Expected behavior: +// - Each hot tenant is detected as a focused value +// - Each hot tenant is independently split on dim 1 (seq) +// - Small tenants are covered by non-focused chunks +// - All rows are accounted for with no gaps +// --------------------------------------------------------------------------- + +describe('e2e: multiple hot keys', () => { + const table = `${TABLE_PREFIX}_multiple_hot_keys` + const hotTenants = ['alpha-corp', 'beta-corp', 'gamma-corp'] + let plan: ChunkPlan + let totalRows: number + + beforeAll(async () => { + totalRows = await requireSeededTable(table) + const uncompressedBytes = await getPartitionUncompressedBytes(table) + + // Target ~10 chunks so each hot tenant (~30% = ~3x target) clearly needs splitting + const targetChunkBytes = Math.floor(uncompressedBytes / 10) + plan = await chunkPlan(table, targetChunkBytes) + }, 60_000) + + test('produces multiple chunks', () => { + expect(plan.chunks.length).toBeGreaterThan(3) + }) + + test('detects all three hot tenants as focused values', () => { + const focusedValues = new Set( + plan.chunks + .map((c) => c.analysis.focusedValue?.value) + .filter(Boolean), + ) + for (const tenant of hotTenants) { + expect(focusedValues.has(tenant)).toBe(true) + } + }) + + test('each hot tenant has 
chunks with ranges on both dimensions', () => { + for (const tenant of hotTenants) { + const tenantChunks = plan.chunks.filter( + (c) => c.analysis.focusedValue?.value === tenant, + ) + expect(tenantChunks.length).toBeGreaterThanOrEqual(1) + + for (const chunk of tenantChunks) { + const dims = new Set(chunk.ranges.map((r) => r.dimensionIndex)) + expect(dims.has(0)).toBe(true) // tenant_id + expect(dims.has(1)).toBe(true) // seq + } + } + }) + + test('estimated row sum is within 20% of actual count', () => { + const estimatedTotal = plan.chunks.reduce((sum, c) => sum + c.estimate.rows, 0) + const ratio = estimatedTotal / totalRows + expect(ratio).toBeGreaterThanOrEqual(0.8) + expect(ratio).toBeLessThanOrEqual(1.2) + }) + + test('no chunk exceeds 2x the target size', () => { + for (const chunk of plan.chunks) { + expect(chunk.estimate.bytesUncompressed).toBeLessThan(plan.targetChunkBytes * 2) + } + }) + + test('executing all chunk queries returns the full row count', async () => { + let totalCounted = 0 + for (const chunk of plan.chunks) { + const where = buildWhereClauseFromChunk(chunk, plan.table) + const countSql = `SELECT count() AS cnt FROM ${db}.${table} WHERE ${where}` + const [row] = await executor.query<{ cnt: string }>(countSql) + totalCounted += Number(row?.cnt ?? 
0) + } + + expect(totalCounted).toBe(totalRows) + }, 60_000) +}) diff --git a/packages/plugin-backfill/src/chunking/introspect.test.ts b/packages/plugin-backfill/src/chunking/introspect.test.ts deleted file mode 100644 index 431872c..0000000 --- a/packages/plugin-backfill/src/chunking/introspect.test.ts +++ /dev/null @@ -1,234 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { introspectTable, queryPartitionInfo, querySortKeyInfo, querySortKeyRanges } from './introspect.js' - -describe('queryPartitionInfo', () => { - test('maps system.parts rows to PartitionInfo array', async () => { - const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202502', total_rows: '2000', total_bytes: '8000000', min_time: '2025-02-01 00:00:00', max_time: '2025-02-28 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(2) - expect(result[0]?.partitionId).toBe('202501') - expect(result[0]?.rows).toBe(1000) - expect(result[0]?.bytesOnDisk).toBe(5000000) - expect(result[1]?.partitionId).toBe('202502') - expect(result[1]?.rows).toBe(2000) - }) - - test('filters out partitions before --from', async () => { - const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202503', total_rows: '3000', total_bytes: '9000000', min_time: '2025-03-01 00:00:00', max_time: '2025-03-31 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - from: '2025-02-01T00:00:00.000Z', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(1) - expect(result[0]?.partitionId).toBe('202503') - }) - - test('filters out partitions at or after --to', async () => { - 
const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202503', total_rows: '3000', total_bytes: '9000000', min_time: '2025-03-01 00:00:00', max_time: '2025-03-31 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - to: '2025-03-01T00:00:00.000Z', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(1) - expect(result[0]?.partitionId).toBe('202501') - }) -}) - -describe('querySortKeyInfo', () => { - test('returns sort key info for table with DateTime sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) { - return [{ sorting_key: 'event_time' }] as T[] - } - if (sql.includes('system.columns')) { - return [{ type: 'DateTime' }] as T[] - } - return [] as T[] - } - - const result = await querySortKeyInfo({ - database: 'default', - table: 'events', - query, - }) - - expect(result).toBeDefined() - expect(result?.column).toBe('event_time') - expect(result?.type).toBe('DateTime') - expect(result?.category).toBe('datetime') - }) - - test('returns numeric category for Int64 sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'id' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'Int64' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.category).toBe('numeric') - }) - - test('returns string category for String sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'name' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'String' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - 
expect(result?.category).toBe('string') - }) - - test('extracts column name from function expression', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'toDate(event_time)' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'DateTime' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.column).toBe('event_time') - }) - - test('returns undefined when table has no sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: '' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result).toBeUndefined() - }) - - test('returns first column from multi-column sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'event_time, id' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'DateTime' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.column).toBe('event_time') - }) -}) - -describe('querySortKeyRanges', () => { - test('returns min/max per partition', async () => { - const query = async () => { - return [ - { partition_id: '202501', min_val: '2025-01-01 00:00:00', max_val: '2025-01-31 23:59:59' }, - { partition_id: '202502', min_val: '2025-02-01 00:00:00', max_val: '2025-02-28 23:59:59' }, - ] as T[] - } - - const result = await querySortKeyRanges({ - database: 'default', - table: 'events', - sortKeyColumn: 'event_time', - partitionIds: ['202501', '202502'], - query, - }) - - expect(result.size).toBe(2) - expect(result.get('202501')?.min).toBe('2025-01-01 00:00:00') - expect(result.get('202502')?.max).toBe('2025-02-28 23:59:59') - }) - - test('returns empty map for empty 
partition list', async () => { - const query = async () => [] as T[] - - const result = await querySortKeyRanges({ - database: 'default', - table: 'events', - sortKeyColumn: 'event_time', - partitionIds: [], - query, - }) - - expect(result.size).toBe(0) - }) -}) - -describe('introspectTable', () => { - test('returns partitions and sort key in a single call', async () => { - const query = async (sql: string) => { - if (sql.includes('system.parts')) { - return [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - ] as T[] - } - if (sql.includes('system.tables')) { - return [{ sorting_key: 'event_time' }] as T[] - } - if (sql.includes('system.columns')) { - return [{ type: 'DateTime' }] as T[] - } - return [] as T[] - } - - const result = await introspectTable({ - database: 'default', - table: 'events', - query, - }) - - expect(result.partitions).toHaveLength(1) - expect(result.partitions[0]?.partitionId).toBe('202501') - expect(result.sortKey).toBeDefined() - expect(result.sortKey?.column).toBe('event_time') - expect(result.sortKey?.category).toBe('datetime') - }) - - test('returns undefined sortKey when table has no sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.parts')) { - return [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - ] as T[] - } - if (sql.includes('system.tables')) { - return [{ sorting_key: '' }] as T[] - } - return [] as T[] - } - - const result = await introspectTable({ - database: 'default', - table: 'events', - query, - }) - - expect(result.partitions).toHaveLength(1) - expect(result.sortKey).toBeUndefined() - }) -}) diff --git a/packages/plugin-backfill/src/chunking/introspect.ts b/packages/plugin-backfill/src/chunking/introspect.ts deleted file mode 100644 index e383f6f..0000000 --- 
a/packages/plugin-backfill/src/chunking/introspect.ts +++ /dev/null @@ -1,146 +0,0 @@ -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const NUMERIC_TYPES = new Set([ - 'Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', - 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', - 'Float32', 'Float64', -]) - -const DATETIME_TYPES = new Set(['Date', 'Date32', 'DateTime', 'DateTime64']) - -function classifySortKeyType(type: string): SortKeyInfo['category'] { - if (NUMERIC_TYPES.has(type)) return 'numeric' - if (DATETIME_TYPES.has(type)) return 'datetime' - if (type.startsWith('DateTime64(')) return 'datetime' - if (type.startsWith("DateTime('")) return 'datetime' - return 'string' -} - -export async function queryPartitionInfo(input: { - database: string - table: string - from?: string - to?: string - query: <T>(sql: string) => Promise<T[]> -}): Promise<PartitionInfo[]> { - // Force replica sync on the target table before reading system.parts. - // select_sequential_consistency is only effective on user tables, not system - // tables, so this preliminary query ensures the replica has caught up with - // all pending writes before we inspect part metadata.
- await input.query( - `SELECT 1 FROM ${input.database}.${input.table} LIMIT 1 SETTINGS select_sequential_consistency = 1` - ) - - const rows = await input.query<{ - partition_id: string - total_rows: string - total_bytes: string - min_time: string - max_time: string - }>( - `SELECT - partition_id, - toString(sum(rows)) AS total_rows, - toString(sum(bytes_on_disk)) AS total_bytes, - toString(min(min_time)) AS min_time, - toString(max(max_time)) AS max_time -FROM system.parts -WHERE database = '${input.database}' - AND table = '${input.table}' - AND active = 1 -GROUP BY partition_id -ORDER BY partition_id -SETTINGS select_sequential_consistency = 1` - ) - - const partitions: PartitionInfo[] = rows.map((row) => ({ - partitionId: row.partition_id, - rows: Number(row.total_rows), - bytesOnDisk: Number(row.total_bytes), - minTime: new Date(row.min_time).toISOString(), - maxTime: new Date(row.max_time).toISOString(), - })) - - return partitions.filter((p) => { - if (input.from && p.maxTime < input.from) return false - if (input.to && p.minTime >= input.to) return false - return true - }) -} - -export async function querySortKeyInfo(input: { - database: string - table: string - query: (sql: string) => Promise -}): Promise { - const tableRows = await input.query<{ sorting_key: string }>( - `SELECT sorting_key FROM system.tables WHERE database = '${input.database}' AND name = '${input.table}'` - ) - - const sortingKey = tableRows[0]?.sorting_key - if (!sortingKey) return undefined - - // Parse first column from sorting key (may have expressions like "toDate(event_time)") - const firstColumn = sortingKey.split(',')[0]?.trim() - if (!firstColumn) return undefined - - // If it's a function call like toDate(col), extract the column name - const match = firstColumn.match(/^\w+\((\w+)\)$/) - const columnName = match ? 
match[1] : firstColumn - if (!columnName) return undefined - - const columnRows = await input.query<{ type: string }>( - `SELECT type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}' AND name = '${columnName}'` - ) - - const type = columnRows[0]?.type - if (!type) return undefined - - return { - column: columnName, - type, - category: classifySortKeyType(type), - } -} - -export async function querySortKeyRanges(input: { - database: string - table: string - sortKeyColumn: string - partitionIds: string[] - query: <T>(sql: string) => Promise<T[]> -}): Promise<Map<string, { min: string; max: string }>> { - if (input.partitionIds.length === 0) return new Map() - - const inList = input.partitionIds.map((id) => `'${id}'`).join(', ') - const rows = await input.query<{ - partition_id: string - min_val: string - max_val: string - }>( - `SELECT _partition_id AS partition_id, toString(min(${input.sortKeyColumn})) AS min_val, toString(max(${input.sortKeyColumn})) AS max_val FROM ${input.database}.${input.table} WHERE _partition_id IN (${inList}) GROUP BY _partition_id SETTINGS select_sequential_consistency = 1` - ) - - const result = new Map() - for (const row of rows) { - result.set(row.partition_id, { min: row.min_val, max: row.max_val }) - } - return result -} - -export async function introspectTable(input: { - database: string - table: string - from?: string - to?: string - query: <T>(sql: string) => Promise<T[]> -}): Promise<{ partitions: PartitionInfo[]; sortKey?: SortKeyInfo }> { - const partitions = await queryPartitionInfo(input) - const sortKey = await querySortKeyInfo({ - database: input.database, - table: input.table, - query: input.query, - }) - - return { partitions, sortKey } -} diff --git a/packages/plugin-backfill/src/chunking/partition-slices.ts b/packages/plugin-backfill/src/chunking/partition-slices.ts new file mode 100644 index 0000000..6baa2f0 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/partition-slices.ts @@ -0,0 +1,154 @@ +import type { + ChunkEstimate, +
EstimateConfidence, + EstimateReason, + Partition, + PartitionSlice, + ChunkDerivationStep, + ChunkRange, +} from './types.js' + +export function buildRootSlice(partition: Partition): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: [], + estimate: { + rows: partition.rows, + bytesCompressed: partition.bytesCompressed, + bytesUncompressed: partition.bytesUncompressed, + confidence: 'high', + reason: 'partition-metadata', + }, + analysis: { + lineage: [], + }, + } +} + +export function buildSliceEstimate( + partition: Partition, + rows: number, + confidence: EstimateConfidence, + reason: EstimateReason, +): ChunkEstimate { + const bytesCompressed = partition.rows > 0 + ? Math.round((rows / partition.rows) * partition.bytesCompressed) + : 0 + const bytesUncompressed = partition.rows > 0 + ? Math.round((rows / partition.rows) * partition.bytesUncompressed) + : 0 + + return { + rows, + bytesCompressed, + bytesUncompressed, + confidence, + reason, + } +} + +export function buildSliceFromRows( + partition: Partition, + input: { + ranges: ChunkRange[] + rows: number + focusedValue?: PartitionSlice['analysis']['focusedValue'] + confidence: EstimateConfidence + reason: EstimateReason + lineage: ChunkDerivationStep[] + }, +): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: input.ranges, + estimate: buildSliceEstimate(partition, input.rows, input.confidence, input.reason), + analysis: { + focusedValue: input.focusedValue, + lineage: input.lineage, + }, + } +} + +export function getTargetChunkRows( + partition: Partition, + targetChunkBytes: number, +): number { + if (partition.bytesUncompressed <= 0) return partition.rows + return (targetChunkBytes * partition.rows) / partition.bytesUncompressed +} + +export function mergeAdjacentSlices( + slices: PartitionSlice[], + targetChunkBytes: number, +): PartitionSlice[] { + if (slices.length <= 1) return slices + + const merged: PartitionSlice[] = [] + let current: 
PartitionSlice | undefined + + for (const slice of slices) { + if (!current) { + current = slice + continue + } + + const canMerge = + !current.analysis.focusedValue && + !slice.analysis.focusedValue && + haveSameTrailingRanges(current.ranges, slice.ranges) && + current.estimate.bytesUncompressed + slice.estimate.bytesUncompressed <= targetChunkBytes * 1.1 + + if (!canMerge) { + merged.push(current) + current = slice + continue + } + + current = { + ...current, + ranges: mergeRanges(current.ranges, slice.ranges), + estimate: { + ...current.estimate, + rows: current.estimate.rows + slice.estimate.rows, + bytesCompressed: current.estimate.bytesCompressed + slice.estimate.bytesCompressed, + bytesUncompressed: current.estimate.bytesUncompressed + slice.estimate.bytesUncompressed, + + }, + } + } + + if (current) merged.push(current) + return merged +} + +function mergeRanges(left: ChunkRange[], right: ChunkRange[]): ChunkRange[] { + return left.map((leftRange) => { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + return rightRange === undefined + ? 
leftRange + : { + dimensionIndex: leftRange.dimensionIndex, + from: leftRange.from, + to: rightRange.to, + } + }) +} + +function haveSameTrailingRanges(left: ChunkRange[], right: ChunkRange[]): boolean { + if (left.length !== right.length) return false + + let differingDimensions = 0 + + for (const leftRange of left) { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + if (!rightRange) return false + + const same = leftRange.from === rightRange.from && leftRange.to === rightRange.to + if (!same) { + differingDimensions += 1 + if (leftRange.to !== rightRange.from) return false + } + } + + return differingDimensions <= 1 +} diff --git a/packages/plugin-backfill/src/chunking/planner.ts b/packages/plugin-backfill/src/chunking/planner.ts new file mode 100644 index 0000000..23e8da8 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/planner.ts @@ -0,0 +1,546 @@ +import pMap from 'p-map' +import { + describeSqlContext, + describeSqlOperation, + formatBytes, + getBackfillLogger, + SLOW_CLICKHOUSE_QUERY_MS, + SLOW_CLICKHOUSE_QUERY_REPEAT_INITIAL_MS, + SLOW_CLICKHOUSE_QUERY_REPEAT_MAX_MS, + summarizeSql, +} from '../logging.js' +import { buildRootSlice, mergeAdjacentSlices } from './partition-slices.js' +import { introspectPartitions, introspectSortKeys } from './services/metadata-source.js' +import { getRowProbeStrategy, getSortKeyRange, parsePlannerDateTime } from './services/row-probe.js' +import { splitSliceWithEqualWidthRanges } from './strategies/equal-width-split.js' +import { buildSingleChunkPartition } from './strategies/metadata-single-chunk.js' +import { + findQuantileBoundaryOnDimension, + splitSliceWithQuantiles, +} from './strategies/quantile-range-split.js' +import { refinePartitionSlices } from './strategies/refinement.js' +import { splitSliceWithGroupByKey } from './strategies/group-by-key-split.js' +import { buildRootStringUpperBound, splitSliceWithStringPrefixes } from 
'./strategies/string-prefix-split.js'
+import { splitSliceWithTemporalBuckets } from './strategies/temporal-bucket-split.js'
+import { getCandidateDimensions } from './strategy-policy.js'
+import type {
+  Chunk,
+  ChunkPlan,
+  GenerateChunkPlanInput,
+  Partition,
+  PartitionBuildResult,
+  PartitionSlice,
+  PlannerContext,
+  PlannerQuery,
+  SortKey,
+  TableProfile,
+} from './types.js'
+import { generateChunkId, generatePlanId } from './utils/ids.js'
+import { getChunkRange, isExactChunkRange, replaceChunkRange } from './utils/ranges.js'
+
+const MAX_SPLIT_DEPTH_MULTIPLIER = 3
+const STOP_SPLIT_FUZZ_FACTOR = 1.5
+const logger = getBackfillLogger('chunking', 'planner')
+const queryLogger = getBackfillLogger('chunking', 'clickhouse')
+
+export async function generateChunkPlan(input: GenerateChunkPlanInput): Promise<ChunkPlan> {
+  const planStartedAt = performance.now()
+  const context: PlannerContext = {
+    database: input.database,
+    table: input.table,
+    from: input.from,
+    to: input.to,
+    targetChunkBytes: input.targetChunkBytes,
+    query: createTimedPlannerQuery(input),
+    querySettings: input.querySettings,
+    rowProbeStrategy: input.rowProbeStrategy ?? 
'count', + } + + logger.info( + `starting chunk plan for ${input.database}.${input.table} (target chunk size ${formatBytes(input.targetChunkBytes)}, row probe ${context.rowProbeStrategy})` + ) + + const introspectionStartedAt = performance.now() + const partitions = await introspectPartitions(context) + const sortKeys = await introspectSortKeys(context) + const table: TableProfile = { + database: input.database, + table: input.table, + sortKeys, + } + const planId = generatePlanId() + + logger.info( + `introspection completed for ${input.database}.${input.table}: ${partitions.length} partitions, ${partitions.filter((partition) => partition.bytesUncompressed > context.targetChunkBytes).length} oversized partitions, ${sortKeys.length} sort keys (${Math.round(performance.now() - introspectionStartedAt)}ms)` + ) + + const slices: PartitionSlice[] = [] + const plannedPartitions: Partition[] = [] + for (const partition of partitions) { + const result = await planPartition(context, partition, table) + slices.push(...result.slices) + plannedPartitions.push({ + ...partition, + diagnostics: result.diagnostics, + }) + } + + const chunks = assignChunkIds(planId, slices) + const chunkBytes = chunks.map((chunk) => chunk.estimate.bytesUncompressed) + const stats = { + totalPartitions: partitions.length, + oversizedPartitions: partitions.filter((partition) => partition.bytesUncompressed > context.targetChunkBytes).length, + focusedChunks: chunks.filter((chunk) => chunk.analysis.focusedValue !== undefined).length, + totalChunks: chunks.length, + avgChunkBytes: chunkBytes.length > 0 + ? Math.round(chunkBytes.reduce((sum, value) => sum + value, 0) / chunkBytes.length) + : 0, + maxChunkBytes: chunkBytes.length > 0 ? Math.max(...chunkBytes) : 0, + minChunkBytes: chunkBytes.length > 0 ? 
Math.min(...chunkBytes) : 0,
+  }
+
+  logger.info(
+    `finished chunk plan for ${input.database}.${input.table}: ${chunks.length} chunks across ${partitions.length} partitions, ${formatBytes(partitions.reduce((sum, partition) => sum + partition.bytesUncompressed, 0))} uncompressed (${Math.round(performance.now() - planStartedAt)}ms)`
+  )
+
+  return {
+    planId,
+    generatedAt: new Date().toISOString(),
+    rowProbeStrategy: getRowProbeStrategy(context),
+    targetChunkBytes: context.targetChunkBytes,
+    table,
+    partitions: plannedPartitions,
+    chunks,
+    totalRows: partitions.reduce((sum, partition) => sum + partition.rows, 0),
+    totalBytesCompressed: partitions.reduce((sum, partition) => sum + partition.bytesCompressed, 0),
+    totalBytesUncompressed: partitions.reduce((sum, partition) => sum + partition.bytesUncompressed, 0),
+    stats,
+  }
+}
+
+async function planPartition(
+  context: PlannerContext,
+  partition: Partition,
+  table: TableProfile,
+): Promise<PartitionBuildResult> {
+  const startedAt = performance.now()
+  logger.info(
+    `planning partition ${partition.partitionId} (${partition.rows.toLocaleString()} rows, ${formatBytes(partition.bytesUncompressed)} uncompressed, target ${formatBytes(context.targetChunkBytes)})`
+  )
+
+  if (partition.bytesUncompressed <= context.targetChunkBytes || table.sortKeys.length === 0) {
+    const refined = await refinePartitionSlices(
+      context,
+      partition,
+      buildSingleChunkPartition(partition),
+      table.sortKeys,
+      false
+    )
+
+    logger.info(
+      `kept partition ${partition.partitionId} as a single chunk (${Math.round(performance.now() - startedAt)}ms, ${partition.bytesUncompressed <= context.targetChunkBytes ? 
'within target size' : 'no sort keys available'})` + ) + + return refined + } + + const rootSlice = buildRootSlice(partition) + const splitSlices = await splitSliceRecursively(context, partition, rootSlice, table.sortKeys, 0) + const mergedSlices = mergeAdjacentSlices(splitSlices, context.targetChunkBytes) + const usedDistributionFallback = mergedSlices.some((slice) => + slice.estimate.reason === 'string-prefix-distribution' || + slice.estimate.reason === 'group-by-key-distribution' || + slice.estimate.reason === 'temporal-distribution' || + slice.estimate.reason === 'equal-width-distribution' + ) + + logger.debug( + `partition ${partition.partitionId} produced ${splitSlices.length} candidate slices before refinement (${mergedSlices.length} after merge, distribution fallback ${usedDistributionFallback ? 'used' : 'not used'})` + ) + + const refined = await refinePartitionSlices( + context, + partition, + mergedSlices, + table.sortKeys, + usedDistributionFallback + ) + + logger.info( + `finished partition ${partition.partitionId}: ${refined.slices.length} chunks (${Math.round(performance.now() - startedAt)}ms)` + ) + + return refined +} + +async function splitSliceRecursively( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + depth: number, +): Promise { + if (slice.estimate.bytesUncompressed <= context.targetChunkBytes * STOP_SPLIT_FUZZ_FACTOR) { + logger.debug( + `stopped splitting slice for partition ${partition.partitionId} at depth ${depth}: ${formatBytes(slice.estimate.bytesUncompressed)} is within threshold ${formatBytes(Math.round(context.targetChunkBytes * STOP_SPLIT_FUZZ_FACTOR))}` + ) + return [slice] + } + + if (depth >= sortKeys.length * MAX_SPLIT_DEPTH_MULTIPLIER) { + logger.debug( + `stopped splitting slice for partition ${partition.partitionId}: reached max depth ${sortKeys.length * MAX_SPLIT_DEPTH_MULTIPLIER}` + ) + return [slice] + } + + const children = await splitOversizedSlice(context, partition, 
slice, sortKeys, depth) + if (children.length <= 1) { + logger.debug(`slice could not be split further for partition ${partition.partitionId} at depth ${depth}`) + return [slice] + } + + const finalized: PartitionSlice[] = [] + for (const child of children) { + finalized.push(...(await splitSliceRecursively(context, partition, child, sortKeys, depth + 1))) + } + + return finalized +} + +async function splitOversizedSlice( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + depth: number, +): Promise { + const candidateDimensions = getCandidateDimensions(sortKeys, slice) + + logger.debug( + `attempting oversized slice split for partition ${partition.partitionId} at depth ${depth} (${formatBytes(slice.estimate.bytesUncompressed)} uncompressed across ${candidateDimensions.length} candidate dimensions)` + ) + + for (const dimensionIndex of candidateDimensions) { + const preparedSlice = await hydrateSliceRange(context, slice, sortKeys, dimensionIndex) + if (!preparedSlice) continue + + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) continue + + const rootLike = depth === 0 + const focusedValue = findFocusedValue(preparedSlice, sortKeys) + + logger.debug( + `trying split dimension ${dimensionIndex} on ${partition.partitionId} using ${sortKey.name} (${sortKey.category})` + ) + + if (sortKey.category === 'string') { + if (rootLike) { + // First pass: equal-width EXPLAIN ESTIMATE (fast, metadata-only) + const estimateSlices = await splitWithEqualWidthEstimate(context, partition, preparedSlice, sortKeys, dimensionIndex) + if (isEffectiveSplit(preparedSlice, estimateSlices)) { + logger.debug(`equal-width estimate split succeeded for partition ${partition.partitionId}: ${estimateSlices.length} slices`) + return applyFocusedValue(estimateSlices, focusedValue) + } + } else { + // Refinement pass: full GROUP BY key to detect hot keys directly + const keySlices = await splitSliceWithGroupByKey(context, partition, 
preparedSlice, sortKeys, dimensionIndex) + if (keySlices && isEffectiveSplit(preparedSlice, keySlices)) { + logger.debug(`group-by-key split succeeded for partition ${partition.partitionId}: ${keySlices.length} slices`) + return applyFocusedValue(keySlices, focusedValue) + } + + // Single hot key: narrow the range and re-enter dispatch so focusedValue is detected + if (keySlices?.length === 1 && keySlices[0]?.analysis.focusedValue) { + const refined = keySlices[0] + const currentRange = getChunkRange(preparedSlice, dimensionIndex) + const refinedRange = getChunkRange(refined, dimensionIndex) + if (currentRange.from !== refinedRange.from || currentRange.to !== refinedRange.to) { + logger.debug(`narrowed single hot key for partition ${partition.partitionId}, re-entering dispatch`) + return splitOversizedSlice(context, partition, refined, sortKeys, depth) + } + } + + // Fallback: GROUP BY prefix when too many distinct keys + const stringSlices = await splitSliceWithStringPrefixes(context, partition, preparedSlice, sortKeys, dimensionIndex) + if (isEffectiveSplit(preparedSlice, stringSlices)) { + logger.debug(`string-prefix split succeeded for partition ${partition.partitionId}: ${stringSlices.length} slices`) + return applyFocusedValue(stringSlices, focusedValue) + } + } + } + + if (sortKey.category === 'datetime' && (!rootLike || focusedValue !== undefined)) { + const temporalSlices = await splitSliceWithTemporalBuckets( + context, + partition, + markFocusedSlice(preparedSlice, focusedValue), + sortKeys, + dimensionIndex + ) + if (isEffectiveSplit(preparedSlice, temporalSlices)) { + logger.debug(`temporal bucket split succeeded for partition ${partition.partitionId}: ${temporalSlices.length} slices`) + return applyFocusedValue(temporalSlices, focusedValue) + } + } + + const rangedSlices = await splitWithRanges(context, partition, preparedSlice, sortKeys, dimensionIndex) + if (isEffectiveSplit(preparedSlice, rangedSlices)) { + logger.debug(`range-based split succeeded 
for partition ${partition.partitionId}: ${rangedSlices.length} slices`) + return applyFocusedValue(rangedSlices, focusedValue) + } + } + + logger.debug(`no effective split found for partition ${partition.partitionId} at depth ${depth}`) + + return [slice] +} + +async function splitWithRanges( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [slice] + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return [slice] + if (sortKey.category === 'string' && isExactChunkRange(range)) return [slice] + + const subCount = Math.ceil(slice.estimate.bytesUncompressed / context.targetChunkBytes) + if (subCount <= 1) return [slice] + + const quantileBoundaries = await buildQuantileBoundaries(context, slice, sortKeys, dimensionIndex, subCount) + if (quantileBoundaries) { + logger.debug( + `using quantile-aligned range split for partition ${partition.partitionId} on dimension ${dimensionIndex} with ${quantileBoundaries.length} boundaries` + ) + return splitSliceWithQuantiles(context, partition, slice, sortKeys, dimensionIndex, quantileBoundaries) + } + + logger.debug( + `falling back to equal-width range split for partition ${partition.partitionId} on dimension ${dimensionIndex} with ${subCount} subranges` + ) + + return splitSliceWithEqualWidthRanges( + context, + partition, + slice, + sortKeys, + dimensionIndex, + range.from, + range.to, + subCount + ) +} + +async function splitWithEqualWidthEstimate( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const estimateContext: PlannerContext = { + ...context, + rowProbeStrategy: 'explain-estimate', + } + return splitWithRanges(estimateContext, partition, slice, sortKeys, dimensionIndex) +} + +async function 
buildQuantileBoundaries( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + subCount: number, +): Promise { + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return undefined + + const steps = Array.from({ length: subCount - 1 }, (_, i) => i + 1) + const foundBoundaries = await pMap( + steps, + (step) => { + const targetCumRows = Math.round((slice.estimate.rows * step) / subCount) + return findQuantileBoundaryOnDimension(context, slice, sortKeys, dimensionIndex, targetCumRows) + }, + { concurrency: 10 }, + ) + const boundaries = [range.from, ...foundBoundaries] + + const uniqueBoundaryCount = new Set(boundaries).size + if (uniqueBoundaryCount <= Math.max(2, Math.ceil(subCount / 3))) { + logger.debug( + `discarded quantile boundaries for partition ${slice.partitionId} on dimension ${dimensionIndex} because only ${uniqueBoundaryCount} unique boundaries remained` + ) + return undefined + } + + return boundaries.concat([range.to]) +} + +async function hydrateSliceRange( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const existingRange = getChunkRange(slice, dimensionIndex) + if (existingRange.from !== undefined && existingRange.to !== undefined) { + return slice + } + + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return undefined + + const observedRange = await getSortKeyRange(context, slice.partitionId, slice.ranges, sortKeys, sortKey) + if (!observedRange) return undefined + + logger.debug( + `hydrated missing sort-key range for partition ${slice.partitionId} on ${sortKey.name}: [${observedRange.min}, ${observedRange.max}]` + ) + + return { + ...slice, + ranges: replaceChunkRange( + slice, + dimensionIndex, + observedRange.min, + toExclusiveUpperBound(observedRange.max, sortKey) + ), + } +} + +function toExclusiveUpperBound(value: string, sortKey: SortKey): string { + if 
(sortKey.category === 'string') {
+    return buildRootStringUpperBound(value)
+  }
+  if (sortKey.category === 'datetime') {
+    return new Date(parsePlannerDateTime(value) + 1000).toISOString()
+  }
+  return String(Number(value) + 1)
+}
+
+function isEffectiveSplit(parentSlice: PartitionSlice, childSlices: PartitionSlice[]): boolean {
+  if (childSlices.length <= 1) return false
+
+  return childSlices.some((childSlice) =>
+    childSlice.estimate.rows !== parentSlice.estimate.rows ||
+    JSON.stringify(childSlice.ranges) !== JSON.stringify(parentSlice.ranges)
+  )
+}
+
+function findFocusedValue(
+  slice: PartitionSlice,
+  sortKeys: SortKey[],
+): { dimensionIndex: number; value: string } | undefined {
+  for (const range of slice.ranges) {
+    const sortKey = sortKeys[range.dimensionIndex]
+    if (sortKey?.category !== 'string') continue
+    if (isExactChunkRange(range) && range.from !== undefined) {
+      return { dimensionIndex: range.dimensionIndex, value: range.from }
+    }
+  }
+  return undefined
+}
+
+function applyFocusedValue(
+  slices: PartitionSlice[],
+  focusedValue: { dimensionIndex: number; value: string } | undefined,
+): PartitionSlice[] {
+  if (!focusedValue) return slices
+  return slices.map((slice) => markFocusedSlice(slice, focusedValue))
+}
+
+function markFocusedSlice(
+  slice: PartitionSlice,
+  focusedValue: { dimensionIndex: number; value: string } | undefined,
+): PartitionSlice {
+  if (!focusedValue) return slice
+  return {
+    ...slice,
+    analysis: {
+      ...slice.analysis,
+      focusedValue,
+    },
+  }
+}
+
+function assignChunkIds(planId: string, slices: PartitionSlice[]): Chunk[] {
+  const chunkIndexes = new Map<string, number>()
+
+  return slices.map((slice) => {
+    const currentIndex = chunkIndexes.get(slice.partitionId) ?? 
0 + chunkIndexes.set(slice.partitionId, currentIndex + 1) + return { + ...slice, + id: generateChunkId(planId, slice.partitionId, currentIndex), + } + }) +} + +function createTimedPlannerQuery( + input: Pick, +): PlannerQuery { + return async function timedPlannerQuery( + sql: string, + settings?: Record, + ): Promise { + const startedAt = performance.now() + const sqlSummary = summarizeSql(sql) + const operation = describeSqlOperation(sql) + const context = describeSqlContext(sql) + const queryLabel = context ? `${operation} (${context})` : operation + let repeatTimer: ReturnType | undefined + let repeatDelayMs = SLOW_CLICKHOUSE_QUERY_REPEAT_INITIAL_MS + const scheduleRepeatWarning = () => { + repeatTimer = setTimeout(() => { + const elapsedRepeatMs = Math.round(performance.now() - startedAt) + queryLogger.warning( + `clickhouse query still running for ${input.database}.${input.table} after ${elapsedRepeatMs}ms: ${queryLabel}` + ) + repeatDelayMs = Math.min(repeatDelayMs * 2, SLOW_CLICKHOUSE_QUERY_REPEAT_MAX_MS) + scheduleRepeatWarning() + }, repeatDelayMs) + } + const slowTimer = setTimeout(() => { + const elapsedMs = Math.round(performance.now() - startedAt) + queryLogger.warning( + `clickhouse query still running for ${input.database}.${input.table} after ${elapsedMs}ms: ${queryLabel} | ${sqlSummary}` + ) + scheduleRepeatWarning() + }, SLOW_CLICKHOUSE_QUERY_MS) + + queryLogger.debug(`clickhouse query started for ${input.database}.${input.table}: ${sqlSummary}`) + + try { + const rows = await input.query(sql, settings) + const durationMs = Math.round(performance.now() - startedAt) + + if (durationMs >= SLOW_CLICKHOUSE_QUERY_MS) { + queryLogger.debug( + `slow clickhouse query completed for ${input.database}.${input.table} in ${durationMs}ms (${rows.length} rows): ${queryLabel}` + ) + } else { + queryLogger.debug( + `clickhouse query completed for ${input.database}.${input.table} in ${durationMs}ms (${rows.length} rows): ${sqlSummary}` + ) + } + + return rows + } 
catch (error) { + queryLogger.error( + `clickhouse query failed for ${input.database}.${input.table} after ${Math.round(performance.now() - startedAt)}ms: ${sqlSummary}` + ) + throw error + } finally { + clearTimeout(slowTimer) + if (repeatTimer) clearTimeout(repeatTimer) + } + } +} diff --git a/packages/plugin-backfill/src/chunking/services/distribution-source.ts b/packages/plugin-backfill/src/chunking/services/distribution-source.ts new file mode 100644 index 0000000..48e884d --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/distribution-source.ts @@ -0,0 +1,109 @@ +import { buildWhereClauseFromRanges } from '../sql.js' +import type { + ChunkRange, + PlannerContext, + SortKey, + StringPrefixBucket, + TemporalBucket, +} from '../types.js' + +type QueryContext = Pick + +export async function probeStringPrefixDistribution( + context: QueryContext, + partitionId: string, + ranges: ChunkRange[], + sortKey: SortKey, + dimensionIndex: number, + depth: number, + sortKeys: SortKey[], +): Promise { + const range = ranges.find((candidate) => candidate.dimensionIndex === dimensionIndex) + if (!range?.from || !range.to) return [] + + const rows = await context.query<{ prefix: string; cnt: string }>(` +SELECT + substring(${sortKey.name}, 1, ${depth}) AS prefix, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)} +GROUP BY prefix +ORDER BY prefix`, + context.querySettings, + ) + + return rows.map((row) => ({ + value: row.prefix, + rowCount: Number(row.cnt), + isExactValue: Buffer.from(row.prefix, 'latin1').length < depth, + })) +} + +export interface StringKeyBucket { + value: string + rowCount: number +} + +export async function probeStringKeyDistribution( + context: QueryContext, + partitionId: string, + ranges: ChunkRange[], + sortKey: SortKey, + dimensionIndex: number, + sortKeys: SortKey[], + limit: number, +): Promise { + const range = ranges.find((candidate) => 
candidate.dimensionIndex === dimensionIndex) + if (!range?.from || !range.to) return undefined + + const rows = await context.query<{ key: string; cnt: string }>(` +SELECT + ${sortKey.name} AS key, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)} +GROUP BY key +ORDER BY cnt DESC +LIMIT ${limit + 1}`, + context.querySettings, + ) + + if (rows.length > limit) return undefined + + return rows.map((row) => ({ + value: row.key, + rowCount: Number(row.cnt), + })) +} + +export async function probeTemporalDistribution( + context: QueryContext, + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], + dimensionIndex: number, + grain: 'day' | 'hour', +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'datetime') return [] + + const bucketExpression = grain === 'day' + ? `toStartOfDay(${sortKey.name})` + : `toStartOfHour(${sortKey.name})` + + const rows = await context.query<{ bucket: string; cnt: string }>(` +SELECT + formatDateTime(${bucketExpression}, '%Y-%m-%dT%H:%i:%sZ') AS bucket, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)} +GROUP BY bucket +ORDER BY bucket`, + context.querySettings, + ) + + return rows.map((row) => ({ + start: row.bucket, + rowCount: Number(row.cnt), + })) +} diff --git a/packages/plugin-backfill/src/chunking/services/metadata-source.ts b/packages/plugin-backfill/src/chunking/services/metadata-source.ts new file mode 100644 index 0000000..f2ad3a6 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/metadata-source.ts @@ -0,0 +1,170 @@ +import type { Partition, PlannerContext, SortKey, SortKeyCategory } from '../types.js' + +/** ClickHouse returns timestamps without timezone — they are always UTC. 
*/ +function parseClickHouseUTCTimestamp(value: string): string { + const trimmed = value.trim() + const normalized = trimmed.includes('T') ? trimmed : `${trimmed.replace(' ', 'T')}Z` + return new Date(normalized.endsWith('Z') ? normalized : `${normalized}Z`).toISOString() +} + +const NUMERIC_TYPES = /^(U?Int|Float|Decimal)/ +const DATETIME_TYPES = /^(Date|DateTime)/ + +function classifySortKeyType(type: string): SortKeyCategory { + if (NUMERIC_TYPES.test(type)) return 'numeric' + if (DATETIME_TYPES.test(type)) return 'datetime' + return 'string' +} + +function boundaryEncodingForCategory(category: SortKeyCategory): SortKey['boundaryEncoding'] { + return category === 'string' ? 'hex-latin1' : 'literal' +} + +function splitTopLevelCsv(input: string): string[] { + const parts: string[] = [] + let current = '' + let depth = 0 + let quote: '\'' | '"' | undefined + + for (let index = 0; index < input.length; index++) { + const char = input[index] + if (char === undefined) continue + + if (quote) { + current += char + if (char === quote && input[index - 1] !== '\\') quote = undefined + continue + } + + if (char === '\'' || char === '"') { + quote = char + current += char + continue + } + + if (char === '(') { + depth += 1 + current += char + continue + } + + if (char === ')') { + depth = Math.max(0, depth - 1) + current += char + continue + } + + if (char === ',' && depth === 0) { + parts.push(current.trim()) + current = '' + continue + } + + current += char + } + + if (current.trim().length > 0) { + parts.push(current.trim()) + } + + return parts +} + +function resolveSortKeyColumn(expression: string, knownColumns: Set): string | undefined { + const trimmed = expression.trim() + if (knownColumns.has(trimmed)) return trimmed + + const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) + .map((match) => match[0]) + .filter((identifier): identifier is string => Boolean(identifier)) + + const matches = Array.from(new 
Set(identifiers.filter((identifier) => knownColumns.has(identifier)))) + if (matches.length === 1) return matches[0] + if (knownColumns.size === 0 && identifiers.length > 0) { + return identifiers[identifiers.length - 1] + } + return undefined +} + +export async function introspectPartitions(context: PlannerContext): Promise { + await context.query( + `SELECT 1 FROM ${context.database}.${context.table} LIMIT 1 SETTINGS select_sequential_consistency = 1` + ) + + const rows = await context.query<{ + partition_id: string + total_rows: string + total_bytes: string + total_uncompressed_bytes?: string + min_time: string + max_time: string + }>(`SELECT + partition_id, + toString(sum(rows)) AS total_rows, + toString(sum(bytes_on_disk)) AS total_bytes, + toString(sum(data_uncompressed_bytes)) AS total_uncompressed_bytes, + toString(min(min_time)) AS min_time, + toString(max(max_time)) AS max_time +FROM system.parts +WHERE database = '${context.database}' + AND table = '${context.table}' + AND active = 1 +GROUP BY partition_id +ORDER BY partition_id +SETTINGS select_sequential_consistency = 1`) + + return rows + .map((row) => ({ + partitionId: row.partition_id, + rows: Number(row.total_rows), + bytesCompressed: Number(row.total_bytes), + bytesUncompressed: Number(row.total_uncompressed_bytes ?? 
row.total_bytes), + minTime: parseClickHouseUTCTimestamp(row.min_time), + maxTime: parseClickHouseUTCTimestamp(row.max_time), + })) + .filter((partition) => { + if (context.from && partition.maxTime < context.from) return false + if (context.to && partition.minTime >= context.to) return false + return true + }) +} + +export async function introspectSortKeys(context: PlannerContext): Promise { + const tableRows = await context.query<{ sorting_key: string }>( + `SELECT sorting_key FROM system.tables WHERE database = '${context.database}' AND name = '${context.table}'` + ) + + const sortingKey = tableRows[0]?.sorting_key + if (!sortingKey) return [] + + const expressions = splitTopLevelCsv(sortingKey) + if (expressions.length === 0) return [] + + const columnRows = await context.query<{ name?: string; type: string }>( + `SELECT name, type FROM system.columns WHERE database = '${context.database}' AND table = '${context.table}'` + ) + + const typeByName = new Map( + columnRows + .filter((row): row is { name: string; type: string } => Boolean(row.name)) + .map((row) => [row.name, row.type]) + ) + + const knownColumns = new Set(typeByName.keys()) + + return expressions.flatMap((expression, index) => { + const column = resolveSortKeyColumn(expression, knownColumns) + const type = column + ? typeByName.get(column) ?? columnRows[index]?.type ?? 
columnRows[0]?.type + : undefined + if (!column || !type) return [] + + const category = classifySortKeyType(type) + return [{ + name: column, + type, + category, + boundaryEncoding: boundaryEncodingForCategory(category), + }] + }) +} diff --git a/packages/plugin-backfill/src/chunking/services/row-probe.ts b/packages/plugin-backfill/src/chunking/services/row-probe.ts new file mode 100644 index 0000000..5b0e2c0 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/row-probe.ts @@ -0,0 +1,108 @@ +import { buildCountSql, buildEstimateSql, buildWhereClauseFromRanges } from '../sql.js' +import type { + ChunkRange, + EstimateFilter, + PlannerContext, + RowProbeStrategy, + SortKey, +} from '../types.js' + +type QueryContext = Pick + +export function getRowProbeStrategy(context: Pick): RowProbeStrategy { + return context.rowProbeStrategy +} + +export async function estimateRows( + context: PlannerContext, + filter: EstimateFilter, + sortKeys: SortKey[], +): Promise { + if (getRowProbeStrategy(context) === 'count') { + return countRowsExact(context, filter, sortKeys) + } + + const rows = await context.query>( + buildEstimateSql(filter, sortKeys, context, getRowProbeStrategy(context)), + context.querySettings, + ) + + const firstRow = rows[0] + if (!firstRow) return 0 + + for (const [key, value] of Object.entries(firstRow)) { + if (!key.toLowerCase().includes('row')) continue + const parsed = Number(value ?? 0) + if (Number.isFinite(parsed)) return parsed + } + + for (const value of Object.values(firstRow)) { + const parsed = Number(value ?? 0) + if (Number.isFinite(parsed)) return parsed + } + + return 0 +} + +export async function countRowsExact( + context: QueryContext, + filter: EstimateFilter, + sortKeys: SortKey[], +): Promise { + const rows = await context.query<{ cnt: string }>(buildCountSql(filter, sortKeys, context), context.querySettings) + return Number(rows[0]?.cnt ?? 
0)
+}
+
+export async function countRows(
+  context: QueryContext,
+  partitionId: string,
+  ranges: ChunkRange[],
+  sortKeys: SortKey[],
+): Promise<number> {
+  const filter: EstimateFilter = {
+    partitionId,
+    ranges,
+    exactDimensionIndex: undefined,
+    exactValue: undefined,
+  }
+  return countRowsExact(context, filter, sortKeys)
+}
+
+export async function countPartitionRows(
+  context: QueryContext,
+  partitionId: string,
+): Promise<number> {
+  const rows = await context.query<{ cnt: string }>(
+    `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE _partition_id = '${partitionId}'`,
+    context.querySettings,
+  )
+  return Number(rows[0]?.cnt ?? 0)
+}
+
+export async function getSortKeyRange(
+  context: QueryContext,
+  partitionId: string,
+  ranges: ChunkRange[],
+  sortKeys: SortKey[],
+  sortKey: SortKey,
+): Promise<{ min: string; max: string } | undefined> {
+  const rows = await context.query<{ minVal: string; maxVal: string }>(`
+SELECT
+  toString(min(${sortKey.name})) AS minVal,
+  toString(max(${sortKey.name})) AS maxVal
+FROM ${context.database}.${context.table}
+WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)}`,
+    context.querySettings,
+  )
+
+  if (rows.length === 0) return undefined
+  return {
+    min: rows[0]?.minVal ?? '',
+    max: rows[0]?.maxVal ?? '',
+  }
+}
+
+export function parsePlannerDateTime(value: string): number {
+  const normalized = value.includes('T') ? value : value.replace(' ', 'T')
+  return Date.parse(normalized.endsWith('Z') ? 
normalized : `${normalized}Z`) +} diff --git a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts new file mode 100644 index 0000000..b7b4d6f --- /dev/null +++ b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts @@ -0,0 +1,451 @@ +import { describe, expect, test } from 'bun:test' + +import { analyzeAndChunk } from './analyze.js' +import { buildChunkExecutionSql } from './sql.js' +import type { Chunk, ChunkPlan } from './types.js' + +const MiB = 1024 ** 2 + +type RowValue = string | number + +interface FixtureRow { + _partition_id: string + event_time: string + [key: string]: RowValue +} + +function isoAt(day: number, hour: number, minute = 0): string { + return new Date(Date.UTC(2026, 0, day, hour, minute, 0)).toISOString() +} + +function createFixtureQuery(input: { + database: string + table: string + rows: FixtureRow[] + sortKeys: Array<{ column: string; type: string }> + bytesPerRow?: number + uncompressedBytesPerRow?: number +}) { + const bytesPerRow = input.bytesPerRow ?? 1024 + const uncompressedBytesPerRow = input.uncompressedBytesPerRow ?? 
bytesPerRow * 2 + + return async function query(sql: string): Promise { + if (sql.includes(`SELECT 1 FROM ${input.database}.${input.table} LIMIT 1`)) { + return [{ ok: 1 }] as T[] + } + + if (sql.includes('FROM system.parts')) { + return summarizePartitions(input.rows, bytesPerRow, uncompressedBytesPerRow) as T[] + } + + if (sql.includes('FROM system.tables')) { + return [{ sorting_key: input.sortKeys.map((key) => key.column).join(', ') }] as T[] + } + + if (sql.includes('FROM system.columns')) { + return input.sortKeys.map((key) => ({ name: key.column, type: key.type })) as T[] + } + + const filteredRows = filterRows(sql, input.rows) + + if (sql.startsWith('EXPLAIN ESTIMATE')) { + return [{ rows: String(filteredRows.length) }] as T[] + } + + if (sql.includes(' AS key') && sql.includes('GROUP BY key')) { + const match = sql.match(/^\s*SELECT\s+(\w+)\s+AS key/m) + const column = match?.[1] + if (!column) return [] as T[] + + const limitMatch = sql.match(/LIMIT\s+(\d+)/) + const limit = limitMatch ? Number(limitMatch[1]) : Infinity + + const grouped = new Map() + for (const row of filteredRows) { + const value = String(row[column] ?? '') + grouped.set(value, (grouped.get(value) ?? 0) + 1) + } + + return Array.from(grouped.entries()) + .sort(([, a], [, b]) => b - a) + .slice(0, limit) + .map(([key, cnt]) => ({ key, cnt: String(cnt) })) as T[] + } + + if (sql.includes('substring(')) { + const match = sql.match(/substring\((\w+), 1, (\d+)\) AS prefix/) + const column = match?.[1] + const depth = Number(match?.[2] ?? 0) + if (!column || depth <= 0) return [] as T[] + + const grouped = new Map() + for (const row of filteredRows) { + const value = String(row[column] ?? '') + const prefix = Buffer.from(value, 'latin1').subarray(0, depth).toString('latin1') + grouped.set(prefix, (grouped.get(prefix) ?? 
0) + 1) + } + + return Array.from(grouped.entries()) + .sort(([left], [right]) => compareLatin1(left, right)) + .map(([prefix, cnt]) => ({ prefix, cnt: String(cnt) })) as T[] + } + + if (sql.includes('formatDateTime(toStartOfDay(') || sql.includes('formatDateTime(toStartOfHour(')) { + const grain = sql.includes('toStartOfDay(') ? 'day' : 'hour' + const columnMatch = sql.match(/toStartOf(?:Day|Hour)\((\w+)\)/) + const column = columnMatch?.[1] + if (!column) return [] as T[] + + const grouped = new Map() + for (const row of filteredRows) { + const bucket = grain === 'day' ? toStartOfDay(String(row[column])) : toStartOfHour(String(row[column])) + grouped.set(bucket, (grouped.get(bucket) ?? 0) + 1) + } + + return Array.from(grouped.entries()) + .sort(([left], [right]) => left.localeCompare(right)) + .map(([bucket, cnt]) => ({ bucket, cnt: String(cnt) })) as T[] + } + + if (sql.includes('toString(min(') && sql.includes('toString(max(')) { + const match = sql.match(/toString\(min\((\w+)\)\) AS minVal,\s+toString\(max\(\1\)\) AS maxVal/s) + const column = match?.[1] + if (!column || filteredRows.length === 0) return [] as T[] + + const values = filteredRows.map((row) => row[column]).filter((value) => value !== undefined) + if (values.length === 0) return [] as T[] + + return [{ + minVal: formatValueForMinMax(values.reduce((current, candidate) => compareValues(candidate, current) < 0 ? candidate : current)), + maxVal: formatValueForMinMax(values.reduce((current, candidate) => compareValues(candidate, current) > 0 ? 
candidate : current)), + }] as T[] + } + + if (sql.includes('SELECT count() AS cnt')) { + return [{ cnt: String(filteredRows.length) }] as T[] + } + + return [] as T[] + } +} + +function summarizePartitions(rows: FixtureRow[], bytesPerRow: number, uncompressedBytesPerRow: number) { + const byPartition = new Map() + for (const row of rows) { + const list = byPartition.get(row._partition_id) + if (list) list.push(row) + else byPartition.set(row._partition_id, [row]) + } + + return Array.from(byPartition.entries()) + .sort(([left], [right]) => left.localeCompare(right)) + .map(([partitionId, partitionRows]) => ({ + partition_id: partitionId, + total_rows: String(partitionRows.length), + total_bytes: String(partitionRows.length * bytesPerRow), + total_uncompressed_bytes: String(partitionRows.length * uncompressedBytesPerRow), + min_time: String(partitionRows.reduce((min, row) => row.event_time < min ? row.event_time : min, partitionRows[0]?.event_time ?? '')), + max_time: String(partitionRows.reduce((max, row) => row.event_time > max ? row.event_time : max, partitionRows[0]?.event_time ?? 
'')), + })) +} + +function filterRows(sql: string, rows: FixtureRow[]): FixtureRow[] { + const whereMatch = sql.match(/WHERE\s+([\s\S]*?)(?:GROUP BY|ORDER BY|SETTINGS|$)/i) + if (!whereMatch?.[1]) return rows + + const clauses = whereMatch[1] + .split(/\s+AND\s+/) + .map((clause) => clause.replace(/\s+/g, ' ').trim()) + .filter(Boolean) + + return rows.filter((row) => clauses.every((clause) => evaluateClause(clause, row))) +} + +function evaluateClause(clause: string, row: FixtureRow): boolean { + let match = clause.match(/^_partition_id = '([^']+)'$/) + if (match) return row._partition_id === match[1] + + match = clause.match(/^(\w+) >= parseDateTimeBestEffort\('([^']+)'\)$/) + if (match) return Date.parse(String(row[match[1]])) >= Date.parse(match[2]) + + match = clause.match(/^(\w+) < parseDateTimeBestEffort\('([^']+)'\)$/) + if (match) return Date.parse(String(row[match[1]])) < Date.parse(match[2]) + + match = clause.match(/^(\w+) >= unhex\('([0-9a-f]*)'\)$/i) + if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) >= 0 + + match = clause.match(/^(\w+) < unhex\('([0-9a-f]*)'\)$/i) + if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) < 0 + + match = clause.match(/^(\w+) >= '([^']+)'$/) + if (match) return comparePrimitive(row[match[1]], match[2]) >= 0 + + match = clause.match(/^(\w+) < '([^']+)'$/) + if (match) return comparePrimitive(row[match[1]], match[2]) < 0 + + match = clause.match(/^(\w+) >= (-?\d+(?:\.\d+)?)$/) + if (match) return Number(row[match[1]]) >= Number(match[2]) + + match = clause.match(/^(\w+) < (-?\d+(?:\.\d+)?)$/) + if (match) return Number(row[match[1]]) < Number(match[2]) + + throw new Error(`Unsupported test clause: ${clause}`) +} + +function comparePrimitive(left: RowValue | undefined, right: string): number { + if (typeof left === 'number') return left - Number(right) + return String(left ?? 
'').localeCompare(right)
+}
+
+function compareValues(left: RowValue, right: RowValue): number {
+  if (typeof left === 'number' && typeof right === 'number') return left - right
+  return compareLatin1(String(left), String(right))
+}
+
+function formatValueForMinMax(value: RowValue): string {
+  return String(value)
+}
+
+function compareLatin1(left: string, right: string): number {
+  return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1'))
+}
+
+function toStartOfDay(value: string): string {
+  const date = new Date(value)
+  return new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate(), 0, 0, 0)).toISOString()
+}
+
+function toStartOfHour(value: string): string {
+  const date = new Date(value)
+  return new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate(), date.getUTCHours(), 0, 0)).toISOString()
+}
+
+async function planFixture(input: {
+  rows: FixtureRow[]
+  sortKeys: Array<{ column: string; type: string }>
+  maxChunkBytes: number
+}): Promise<ChunkPlan> {
+  const query = createFixtureQuery({
+    database: 'app',
+    table: 'events',
+    rows: input.rows,
+    sortKeys: input.sortKeys,
+  })
+
+  return analyzeAndChunk({
+    database: 'app',
+    table: 'events',
+    targetChunkBytes: input.maxChunkBytes,
+    query,
+  })
+}
+
+function strategyIds(chunk: Pick<Chunk, 'analysis'>): string[] {
+  return chunk.analysis.lineage.map((step) => step.strategyId)
+}
+
+function buildSqlForChunk(plan: ChunkPlan, chunk: Chunk): string {
+  return buildChunkExecutionSql({
+    planId: 'fixture-plan',
+    chunk,
+    target: 'app.events',
+    sourceTarget: 'app.events',
+    table: plan.table,
+  })
+}
+
+function requireChunk<T>(value: T | undefined, label: string): T {
+  if (value === undefined) {
+    throw new Error(`Missing expected chunk: ${label}`)
+  }
+  return value
+}
+
+describe('smart chunking integration', () => {
+  test('keeps small partitions as a single metadata chunk', async () => {
+    const rows = Array.from({ length: 12 }, (_, index) => ({
+      _partition_id: 'p_small',
+      
event_time: isoAt(1, index), + id: index, + })) + + const plan = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 64 * MiB, + }) + + expect(plan.chunks).toHaveLength(1) + expect(plan.chunks[0]?.estimate.reason).toBe('partition-metadata') + expect(strategyIds(requireChunk(plan.chunks[0], 'metadata chunk'))).toHaveLength(0) + }) + + test('uses quantile range splitting for wide numeric distributions', async () => { + const rows = Array.from({ length: 120 }, (_, index) => ({ + _partition_id: 'p_quantile', + event_time: isoAt(2, index % 24), + id: index, + })) + + const plan = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 60 * 1024, + }) + + expect(plan.chunks.length).toBeGreaterThanOrEqual(3) + expect(plan.chunks.every((chunk) => strategyIds(chunk).includes('quantile-range-split'))).toBe(true) + + const estimatedRows = plan.chunks.map((chunk) => chunk.estimate.rows) + expect(Math.max(...estimatedRows) - Math.min(...estimatedRows)).toBeLessThanOrEqual(4) + }) + + test('falls back to equal-width splitting when quantile boundaries collapse', async () => { + const rows = Array.from({ length: 80 }, (_, index) => ({ + _partition_id: 'p_equal', + event_time: isoAt(3, index % 24), + id: 100 + (index % 2), + })) + + const plan = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 40 * 1024, + }) + + expect(plan.chunks.length).toBeGreaterThan(1) + expect(plan.chunks.some((chunk) => strategyIds(chunk).includes('equal-width-split'))).toBe(true) + expect(plan.chunks.every((chunk) => chunk.estimate.rows > 0)).toBe(true) + expect(plan.chunks.every((chunk) => + chunk.ranges.every((range) => range.from !== range.to) + )).toBe(true) + }) + + test('uses string key splitting for string-distributed partitions', async () => { + const rows: FixtureRow[] = [] + for (const prefix of ['apple', 'apricot', 'banana', 'berry', 'citrus']) { + for (let index = 0; index < 
24; index++) { + rows.push({ + _partition_id: 'p_string', + event_time: isoAt(4, index % 24), + slug: `${prefix}-${index.toString().padStart(2, '0')}`, + }) + } + } + + const plan = await planFixture({ + rows, + sortKeys: [{ column: 'slug', type: 'String' }], + maxChunkBytes: 48 * 1024, + }) + + expect(plan.chunks.length).toBeGreaterThan(2) + const usesStringStrategy = plan.chunks.some((chunk) => + strategyIds(chunk).includes('group-by-key-split') || + strategyIds(chunk).includes('string-prefix-split') + ) + expect(usesStringStrategy).toBe(true) + + const sql = buildSqlForChunk(plan, requireChunk(plan.chunks[0], 'string-key first chunk')) + expect(sql).toContain("unhex('") + }) + + test('combines string-prefix and temporal splitting for focused time windows', async () => { + const rows: FixtureRow[] = [] + + for (let day = 1; day <= 3; day++) { + for (let hour = 0; hour < 24; hour++) { + rows.push({ + _partition_id: 'p_combo_temporal', + event_time: isoAt(10 + day, hour), + user_id: 'hot', + score: 1000 + day * 24 + hour, + }) + } + } + + for (let index = 0; index < 18; index++) { + rows.push({ + _partition_id: 'p_combo_temporal', + event_time: isoAt(10, index), + user_id: `cold-${index}`, + score: index, + }) + } + + const plan = await planFixture({ + rows, + sortKeys: [ + { column: 'user_id', type: 'String' }, + { column: 'event_time', type: 'DateTime' }, + ], + maxChunkBytes: 36 * 1024, + }) + + const hotChunks = plan.chunks.filter((chunk) => + strategyIds(chunk).includes('temporal-bucket-split') && + chunk.ranges.some((range) => range.dimensionIndex === 0) && + chunk.ranges.some((range) => range.dimensionIndex === 1) + ) + + expect(hotChunks.length).toBeGreaterThan(0) + expect(hotChunks.every((chunk) => chunk.analysis.focusedValue?.value === 'hot')).toBe(true) + + const sql = buildSqlForChunk(plan, requireChunk(hotChunks[0], 'temporal combo chunk')) + expect(sql).toContain('user_id >=') + expect(sql).toContain('event_time >=') + 
expect(sql).toContain('parseDateTimeBestEffort')
+
+    const temporalRanges = hotChunks
+      .map((chunk) => chunk.ranges.find((range) => range.dimensionIndex === 1))
+      .filter((range): range is NonNullable<typeof range> => Boolean(range))
+      .sort((left, right) => String(left.from).localeCompare(String(right.from)))
+
+    for (let index = 1; index < temporalRanges.length; index++) {
+      expect(temporalRanges[index - 1]?.to).toBe(temporalRanges[index]?.from)
+    }
+  })
+
+  test('combines string-prefix and quantile splitting on secondary numeric dimensions', async () => {
+    const rows: FixtureRow[] = []
+
+    for (let index = 0; index < 96; index++) {
+      rows.push({
+        _partition_id: 'p_combo_numeric',
+        event_time: isoAt(20, index % 24),
+        account: 'vip',
+        seq: index,
+      })
+    }
+
+    for (let index = 0; index < 24; index++) {
+      rows.push({
+        _partition_id: 'p_combo_numeric',
+        event_time: isoAt(20, index % 24),
+        account: `free-${index}`,
+        seq: index,
+      })
+    }
+
+    const plan = await planFixture({
+      rows,
+      sortKeys: [
+        { column: 'account', type: 'String' },
+        { column: 'seq', type: 'UInt64' },
+      ],
+      maxChunkBytes: 48 * 1024,
+    })
+
+    const comboChunks = plan.chunks.filter((chunk) =>
+      strategyIds(chunk).includes('quantile-range-split') &&
+      chunk.ranges.some((range) => range.dimensionIndex === 0) &&
+      chunk.ranges.some((range) => range.dimensionIndex === 1)
+    )
+
+    expect(comboChunks.length).toBeGreaterThan(0)
+
+    const sql = buildSqlForChunk(plan, requireChunk(comboChunks[0], 'numeric combo chunk'))
+    expect(sql).toContain('account >=')
+    expect(sql).toContain("seq >= '")
+  })
+})
diff --git a/packages/plugin-backfill/src/chunking/splitter.test.ts b/packages/plugin-backfill/src/chunking/splitter.test.ts
deleted file mode 100644
index 16f4f3f..0000000
--- a/packages/plugin-backfill/src/chunking/splitter.test.ts
+++ /dev/null
@@ -1,64 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-
-import { splitSortKeyRange, stringToUint64, uint64ToString } from './splitter.js'
-
-describe('splitSortKeyRange', () => { - test('numeric: splits into equal-width ranges', () => { - const ranges = splitSortKeyRange('numeric', '100', '200', 2) - - expect(ranges).toHaveLength(2) - expect(ranges[0]?.from).toBe('100') - expect(ranges[0]?.to).toBe('150') - expect(ranges[1]?.from).toBe('150') - expect(ranges[1]?.to).toBe('201') - }) - - test('datetime: splits into equal-width time ranges', () => { - const ranges = splitSortKeyRange('datetime', '2025-01-01 00:00:00', '2025-01-31 00:00:00', 3) - - expect(ranges).toHaveLength(3) - for (const r of ranges) { - expect(r.from).toBeDefined() - expect(r.to).toBeDefined() - } - }) - - test('string: round-trips through uint64 conversion', () => { - const ranges = splitSortKeyRange('string', 'aaa', 'zzz', 2) - - expect(ranges).toHaveLength(2) - expect(ranges[0]?.from).toBeDefined() - expect(ranges[1]?.to).toBeDefined() - }) -}) - -describe('stringToUint64 / uint64ToString', () => { - test('round-trips short strings', () => { - const original = 'abc' - const n = stringToUint64(original) - const back = uint64ToString(n) - expect(back).toBe(original) - }) - - test('round-trips 8-byte strings', () => { - const original = 'abcdefgh' - const n = stringToUint64(original) - const back = uint64ToString(n) - expect(back).toBe(original) - }) - - test('truncates strings longer than 8 bytes', () => { - const n = stringToUint64('abcdefghijklmnop') - const back = uint64ToString(n) - expect(back).toBe('abcdefgh') - }) - - test('handles embedded zero bytes from arithmetic', () => { - // Simulates a computed intermediate where a middle byte is 0x00 - // e.g. 
0x6200000000000001 has zero bytes between 'b' and the trailing 0x01 - const n = 0x6200000000000001n - const result = uint64ToString(n) - expect(result).toBe('b\0\0\0\0\0\0\x01') - expect(result.length).toBe(8) - }) -}) diff --git a/packages/plugin-backfill/src/chunking/splitter.ts b/packages/plugin-backfill/src/chunking/splitter.ts deleted file mode 100644 index c3a3e5c..0000000 --- a/packages/plugin-backfill/src/chunking/splitter.ts +++ /dev/null @@ -1,86 +0,0 @@ -import type { SortKeyInfo } from './types.js' - -export function splitNumericRange(min: number, max: number, count: number): Array<{ from: string; to: string }> { - const span = max - min - const step = span / count - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = min + i * step - const to = i === count - 1 ? max + 1 : min + (i + 1) * step - ranges.push({ from: String(from), to: String(to) }) - } - return ranges -} - -export function splitDateTimeRange(min: string, max: string, count: number): Array<{ from: string; to: string }> { - const minMs = new Date(min).getTime() - const maxMs = new Date(max).getTime() - const span = maxMs - minMs - const step = span / count - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = new Date(minMs + i * step).toISOString() - const to = i === count - 1 - ? 
new Date(maxMs + 1).toISOString() - : new Date(minMs + (i + 1) * step).toISOString() - ranges.push({ from, to }) - } - return ranges -} - -export function stringToUint64(s: string): bigint { - let result = 0n - const bytes = Math.min(s.length, 8) - for (let i = 0; i < bytes; i++) { - result = (result << 8n) | BigInt(s.charCodeAt(i)) - } - // Pad remaining bytes with zeros - for (let i = bytes; i < 8; i++) { - result = result << 8n - } - return result -} - -export function uint64ToString(n: bigint): string { - const chars: string[] = [] - for (let i = 7; i >= 0; i--) { - const byte = Number((n >> BigInt(i * 8)) & 0xffn) - chars.push(String.fromCharCode(byte)) - } - // Trim trailing NUL bytes (padding from stringToUint64 for short strings) - let end = chars.length - while (end > 0 && chars[end - 1] === '\0') end-- - return chars.slice(0, end).join('') -} - -export function splitStringRange(min: string, max: string, count: number): Array<{ from: string; to: string }> { - const minVal = stringToUint64(min) - const maxVal = stringToUint64(max) - const span = maxVal - minVal - const step = span / BigInt(count) - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = uint64ToString(minVal + BigInt(i) * step) - const to = i === count - 1 - ? 
uint64ToString(maxVal + 1n) - : uint64ToString(minVal + BigInt(i + 1) * step) - ranges.push({ from, to }) - } - return ranges -} - -export function splitSortKeyRange( - category: SortKeyInfo['category'], - min: string, - max: string, - count: number, -): Array<{ from: string; to: string }> { - switch (category) { - case 'numeric': - return splitNumericRange(Number(min), Number(max), count) - case 'datetime': - return splitDateTimeRange(min, max, count) - case 'string': - return splitStringRange(min, max, count) - } -} diff --git a/packages/plugin-backfill/src/chunking/sql.ts b/packages/plugin-backfill/src/chunking/sql.ts index e6b7458..cd57901 100644 --- a/packages/plugin-backfill/src/chunking/sql.ts +++ b/packages/plugin-backfill/src/chunking/sql.ts @@ -1,48 +1,101 @@ -import type { PlannedChunk, SortKeyInfo } from './types.js' +import type { + Chunk, + ChunkRange, + EstimateFilter, + PlannerContext, + RowProbeStrategy, + SortKey, + TableProfile, +} from './types.js' + + +export function quoteSqlString(value: string): string { + return `'${value.replaceAll('\\', '\\\\').replaceAll('\'', '\\\'')}'` +} + +export function formatBound(value: string, sortKey: SortKey): string { + if (sortKey.category === 'datetime') { + return `parseDateTimeBestEffort(${quoteSqlString(value)})` + } + + if (sortKey.category === 'string') { + return `unhex('${Buffer.from(value, 'latin1').toString('hex')}')` + } + + return quoteSqlString(value) +} + +export function buildWhereClauseFromRanges( + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], +): string { + const conditions = [`_partition_id = ${quoteSqlString(partitionId)}`] + + for (const range of ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) continue + + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } + } + + return 
conditions.join('\n AND ') +} + +export function buildWhereClauseFromChunk( + chunk: Pick, + table: Pick, +): string { + return buildWhereClauseFromRanges(chunk.partitionId, chunk.ranges, table.sortKeys) +} function buildSettingsClause(token: string): string { if (token) { return `SETTINGS async_insert=0, insert_deduplication_token='${token}'` } - return `SETTINGS async_insert=0` + return 'SETTINGS async_insert=0' } -function buildSortKeyCondition( - sortKeyColumn: string, - category: SortKeyInfo['category'], - from: string, - to: string, -): string { - if (category === 'datetime') { - return ` AND ${sortKeyColumn} >= parseDateTimeBestEffort('${from}')\n AND ${sortKeyColumn} < parseDateTimeBestEffort('${to}')` - } - // numeric and string use direct comparison - return ` AND ${sortKeyColumn} >= '${from}'\n AND ${sortKeyColumn} < '${to}'` +function buildChunkConditions(chunk: Pick, sortKeys: SortKey[]): string[] { + return chunk.ranges.flatMap((range) => { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) return [] + + const conditions: string[] = [] + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } + return conditions + }) } -export function buildChunkSql(input: { +export function buildChunkExecutionSql(input: { planId: string - chunk: PlannedChunk + chunk: Chunk target: string - sortKey?: SortKeyInfo + table: Pick + sourceTarget?: string mvAsQuery?: string targetColumns?: string[] + idempotencyToken?: string }): string { - const header = `/* chkit backfill plan=${input.planId} chunk=${input.chunk.id} token=${input.chunk.idempotencyToken} */` - const settings = buildSettingsClause(input.chunk.idempotencyToken) - const { chunk } = input + const sourceTarget = input.sourceTarget ?? 
input.target + const header = `/* chkit backfill plan=${input.planId} chunk=${input.chunk.id} token=${input.idempotencyToken ?? ''} */` + const settings = buildSettingsClause(input.idempotencyToken ?? '') + const chunkConditions = buildChunkConditions(input.chunk, input.table.sortKeys) if (input.mvAsQuery) { - // MV replay: inject partition + sort key filters into the MV's AS query - let filtered = injectPartitionFilter(input.mvAsQuery, chunk.partitionId) - if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && input.sortKey) { - filtered = injectSortKeyFilter( - filtered, - input.sortKey.column, - input.sortKey.category, - chunk.sortKeyFrom, - chunk.sortKeyTo, - ) + let filtered = injectPartitionFilter(input.mvAsQuery, input.chunk.partitionId) + for (const condition of chunkConditions) { + filtered = injectWhereCondition(filtered, condition) } if (input.targetColumns?.length) { filtered = rewriteSelectColumns(filtered, input.targetColumns) @@ -50,48 +103,96 @@ export function buildChunkSql(input: { return [header, `INSERT INTO ${input.target}`, filtered, settings].join('\n') } - // Direct table copy const lines = [ header, `INSERT INTO ${input.target}`, - `SELECT *`, - `FROM ${input.target}`, - `WHERE _partition_id = '${chunk.partitionId}'`, + 'SELECT *', + `FROM ${sourceTarget}`, + `WHERE _partition_id = ${quoteSqlString(input.chunk.partitionId)}`, ] - if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && input.sortKey) { - lines.push(buildSortKeyCondition( - input.sortKey.column, - input.sortKey.category, - chunk.sortKeyFrom, - chunk.sortKeyTo, - )) + for (const condition of chunkConditions) { + lines.push(` AND ${condition}`) } lines.push(settings) return lines.join('\n') } -// --- SQL helpers --- +export function buildEstimateSql( + filter: EstimateFilter, + sortKeys: SortKey[], + context: PlannerContext, + rowProbeStrategy: RowProbeStrategy, +): string { + const whereClause = buildWhereClauseFromFilter(filter, sortKeys) + if 
(rowProbeStrategy === 'count') { + return `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE ${whereClause}` + } + return `EXPLAIN ESTIMATE SELECT count() FROM ${context.database}.${context.table} WHERE ${whereClause}` +} + +export function buildCountSql( + filter: EstimateFilter, + sortKeys: SortKey[], + context: Pick, +): string { + return `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE ${buildWhereClauseFromFilter(filter, sortKeys)}` +} + +function buildWhereClauseFromFilter( + filter: EstimateFilter, + sortKeys: SortKey[], +): string { + const conditions = [`_partition_id = ${quoteSqlString(filter.partitionId)}`] + + for (const range of filter.ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) continue + + if (filter.exactDimensionIndex === range.dimensionIndex && filter.exactValue !== undefined) { + conditions.push(`${sortKey.name} = ${formatBound(filter.exactValue, sortKey)}`) + continue + } + + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } + } + + return conditions.join(' AND ') +} function injectPartitionFilter(query: string, partitionId: string): string { - const condition = `_partition_id = '${partitionId}'` - return injectWhereCondition(query, condition) + return injectWhereCondition(query, `_partition_id = ${quoteSqlString(partitionId)}`) } export function injectSortKeyFilter( query: string, sortKeyColumn: string, - category: SortKeyInfo['category'], + category: SortKey['category'], from: string, to: string, ): string { let condition: string + if (category === 'datetime') { - condition = `${sortKeyColumn} >= parseDateTimeBestEffort('${from}')\n AND ${sortKeyColumn} < parseDateTimeBestEffort('${to}')` + condition = + `${sortKeyColumn} >= parseDateTimeBestEffort(${quoteSqlString(from)})\n` + + ` AND ${sortKeyColumn} < 
parseDateTimeBestEffort(${quoteSqlString(to)})` + } else if (category === 'string') { + condition = + `${sortKeyColumn} >= unhex('${Buffer.from(from, 'latin1').toString('hex')}')\n` + + ` AND ${sortKeyColumn} < unhex('${Buffer.from(to, 'latin1').toString('hex')}')` } else { - condition = `${sortKeyColumn} >= '${from}'\n AND ${sortKeyColumn} < '${to}'` + condition = + `${sortKeyColumn} >= ${quoteSqlString(from)}\n` + + ` AND ${sortKeyColumn} < ${quoteSqlString(to)}` } + return injectWhereCondition(query, condition) } @@ -99,40 +200,51 @@ function injectWhereCondition(query: string, condition: string): string { const trimmed = query.trimEnd() const upper = trimmed.toUpperCase() - interface KWHit { keyword: string; position: number } - const hits: KWHit[] = [] + interface KeywordHit { + keyword: string + position: number + } + + const hits: KeywordHit[] = [] let depth = 0 - for (let i = 0; i < trimmed.length; i++) { - const ch = trimmed[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < trimmed.length && trimmed[i] !== "'") { - if (trimmed[i] === '\\') i++ - i++ + for (let index = 0; index < trimmed.length; index++) { + const char = trimmed[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < trimmed.length && trimmed[index] !== '\'') { + if (trimmed[index] === '\\') index += 1 + index += 1 } continue } if (depth !== 0) continue - - if (i > 0 && /\S/.test(trimmed[i - 1] ?? '')) continue - - const rest = upper.slice(i) - for (const kw of ['WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS']) { - if (rest.startsWith(kw) && (i + kw.length >= trimmed.length || /\s/.test(trimmed[i + kw.length] ?? ''))) { - hits.push({ keyword: kw, position: i }) + if (index > 0 && /\S/.test(trimmed[index - 1] ?? 
'')) continue + + const rest = upper.slice(index) + for (const keyword of ['WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS']) { + if ( + rest.startsWith(keyword) && + (index + keyword.length >= trimmed.length || /\s/.test(trimmed[index + keyword.length] ?? '')) + ) { + hits.push({ keyword, position: index }) break } } } - const whereHit = hits.find(h => h.keyword === 'WHERE') - const trailingKeywords = ['GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS'] + const whereHit = hits.find((hit) => hit.keyword === 'WHERE') const firstTrailing = hits - .filter(h => trailingKeywords.includes(h.keyword)) - .filter(h => !whereHit || h.position > whereHit.position)[0] + .filter((hit) => hit.keyword !== 'WHERE') + .filter((hit) => !whereHit || hit.position > whereHit.position)[0] const insertAt = firstTrailing ? firstTrailing.position : trimmed.length const before = trimmed.slice(0, insertAt).trimEnd() @@ -141,6 +253,7 @@ function injectWhereCondition(query: string, condition: string): string { if (whereHit) { return `${before}\n AND ${condition}${after ? `\n${after}` : ''}` } + return `${before}\nWHERE ${condition}${after ? 
`\n${after}` : ''}` } @@ -152,57 +265,85 @@ export function rewriteSelectColumns(query: string, targetColumns: string[]): st let fromPos = -1 let depth = 0 - for (let i = 0; i < trimmed.length; i++) { - const ch = trimmed[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < trimmed.length && trimmed[i] !== "'") { - if (trimmed[i] === '\\') i++ - i++ + for (let index = 0; index < trimmed.length; index++) { + const char = trimmed[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < trimmed.length && trimmed[index] !== '\'') { + if (trimmed[index] === '\\') index += 1 + index += 1 } continue } if (depth !== 0) continue - - if (i > 0 && /\S/.test(trimmed[i - 1] ?? '')) continue - - const rest = upper.slice(i) - if (selectPos === -1 && rest.startsWith('SELECT') && (i + 6 >= trimmed.length || /\s/.test(trimmed[i + 6] ?? ''))) { - selectPos = i - } else if (selectPos !== -1 && fromPos === -1 && rest.startsWith('FROM') && (i + 4 >= trimmed.length || /\s/.test(trimmed[i + 4] ?? ''))) { - fromPos = i + if (index > 0 && /\S/.test(trimmed[index - 1] ?? '')) continue + + const rest = upper.slice(index) + if ( + selectPos === -1 && + rest.startsWith('SELECT') && + (index + 6 >= trimmed.length || /\s/.test(trimmed[index + 6] ?? '')) + ) { + selectPos = index + } else if ( + selectPos !== -1 && + fromPos === -1 && + rest.startsWith('FROM') && + (index + 4 >= trimmed.length || /\s/.test(trimmed[index + 4] ?? 
'')) + ) { + fromPos = index } } if (selectPos === -1 || fromPos === -1) return query - const projStart = selectPos + 6 - const projText = trimmed.slice(projStart, fromPos).trim() + const projectionStart = selectPos + 6 + const rawProjection = trimmed.slice(projectionStart, fromPos).trim() + let projectionPrefix = '' + let projection = rawProjection + + const distinctMatch = rawProjection.match(/^DISTINCT\b\s*/i) + if (distinctMatch) { + projectionPrefix = distinctMatch[0] ?? '' + projection = rawProjection.slice(projectionPrefix.length).trim() + } const items: string[] = [] let itemStart = 0 depth = 0 - for (let i = 0; i < projText.length; i++) { - const ch = projText[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < projText.length && projText[i] !== "'") { - if (projText[i] === '\\') i++ - i++ + for (let index = 0; index < projection.length; index++) { + const char = projection[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < projection.length && projection[index] !== '\'') { + if (projection[index] === '\\') index += 1 + index += 1 } continue } - if (depth === 0 && ch === ',') { - items.push(projText.slice(itemStart, i).trim()) - itemStart = i + 1 + if (depth === 0 && char === ',') { + items.push(projection.slice(itemStart, index).trim()) + itemStart = index + 1 } } - items.push(projText.slice(itemStart).trim()) + items.push(projection.slice(itemStart).trim()) const aliasMap = new Map() for (const item of items) { @@ -210,38 +351,43 @@ export function rewriteSelectColumns(query: string, targetColumns: string[]): st const itemUpper = item.toUpperCase() let asPos = -1 - let d = 0 - - for (let i = 0; i < item.length; i++) { - const ch = item[i] - if (ch === '(') { d++; continue } - if (ch === ')') { d--; continue } - if (ch === "'") { - i++ - while (i < item.length && item[i] !== "'") 
{ - if (item[i] === '\\') i++ - i++ + let itemDepth = 0 + + for (let index = 0; index < item.length; index++) { + const char = item[index] + if (char === '(') { + itemDepth += 1 + continue + } + if (char === ')') { + itemDepth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < item.length && item[index] !== '\'') { + if (item[index] === '\\') index += 1 + index += 1 } continue } - if (d !== 0) continue - if (i > 0 && /\S/.test(item[i - 1] ?? '')) continue - - const rest = itemUpper.slice(i) - if (rest.startsWith('AS') && (i + 2 >= item.length || /\s/.test(item[i + 2] ?? ''))) { - asPos = i + if (itemDepth !== 0) continue + if (index > 0 && /\S/.test(item[index - 1] ?? '')) continue + + const rest = itemUpper.slice(index) + if ( + rest.startsWith('AS') && + (index + 2 >= item.length || /\s/.test(item[index + 2] ?? '')) + ) { + asPos = index } } if (asPos !== -1) { - const alias = item.slice(asPos + 2).trim() - aliasMap.set(alias, item) + aliasMap.set(item.slice(asPos + 2).trim(), item) } } - const rewrittenCols = targetColumns.map(col => aliasMap.get(col) ?? col) - - const before = trimmed.slice(0, projStart) - const after = trimmed.slice(fromPos) - return `${before} ${rewrittenCols.join(', ')}\n${after}` + const rewrittenProjection = targetColumns.map((column) => aliasMap.get(column) ?? 
column) + return `${trimmed.slice(0, projectionStart)} ${projectionPrefix}${rewrittenProjection.join(', ')}\n${trimmed.slice(fromPos)}` } diff --git a/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts b/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts new file mode 100644 index 0000000..351e6ad --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts @@ -0,0 +1,73 @@ +import pMap from 'p-map' +import { buildSliceFromRows } from '../partition-slices.js' +import { estimateRows } from '../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' +import { replaceChunkRange } from '../utils/ranges.js' +import { buildEvenlySpacedBoundaries } from './quantile-range-split.js' + +export const DEFAULT_OVERSAMPLING_MULTIPLIER = 3 +const ESTIMATE_CONCURRENCY = 50 + +export async function splitSliceWithEqualWidthRanges( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + subCount: number, + oversamplingMultiplier: number = DEFAULT_OVERSAMPLING_MULTIPLIER, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [slice] + + const boundaries = Array.from( + new Set(buildEvenlySpacedBoundaries(rangeFrom, rangeTo, subCount * oversamplingMultiplier, sortKey)) + ) + if (boundaries.length <= 2) return [slice] + + const intervals: Array<{ from: string; to: string }> = [] + for (let index = 0; index < boundaries.length - 1; index++) { + const from = boundaries[index] + const to = boundaries[index + 1] + if (from === undefined || to === undefined || from === to) continue + intervals.push({ from, to }) + } + + const results = await pMap( + intervals, + async ({ from, to }) => { + const ranges = replaceChunkRange(slice, dimensionIndex, from, to) + const rows = await estimateRows( + context, + { partitionId: 
partition.partitionId, ranges }, + sortKeys, + ) + if (rows <= 0) return null + return buildSliceFromRows(partition, { + ranges, + rows, + focusedValue: slice.analysis.focusedValue, + confidence: context.rowProbeStrategy === 'count' ? 'exact' : 'low', + reason: context.rowProbeStrategy === 'count' ? 'exact-count' : 'equal-width-distribution', + lineage: slice.analysis.lineage.concat([ + { + strategyId: 'equal-width-split', + dimensionIndex, + reason: 'fallback to equal-width ranges', + }, + ]), + }) + }, + { concurrency: ESTIMATE_CONCURRENCY }, + ) + + const slices = results.filter((s): s is PartitionSlice => s !== null) + return slices.length > 0 ? slices : [slice] +} diff --git a/packages/plugin-backfill/src/chunking/strategies/group-by-key-split.ts b/packages/plugin-backfill/src/chunking/strategies/group-by-key-split.ts new file mode 100644 index 0000000..02807e8 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/group-by-key-split.ts @@ -0,0 +1,93 @@ +import { buildSliceFromRows } from '../partition-slices.js' +import { + type StringKeyBucket, + probeStringKeyDistribution, +} from '../services/distribution-source.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' +import { compareBinaryStrings, maxBinaryString, minBinaryString } from '../utils/binary-string.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const KEY_LIMIT = 100 + +export async function splitSliceWithGroupByKey( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'string') return undefined + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return undefined + + const buckets = await probeStringKeyDistribution( + context, + slice.partitionId, + slice.ranges, + sortKey, + 
dimensionIndex, + sortKeys, + KEY_LIMIT, + ) + + if (!buckets || buckets.length === 0) return undefined + + // Sort by value for range-ordered slice construction + const sorted = [...buckets].sort((a, b) => compareBinaryStrings(a.value, b.value)) + + return buildKeySlices(partition, slice, dimensionIndex, range.from, range.to, sorted) +} + +function buildKeySlices( + partition: Partition, + parentSlice: PartitionSlice, + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + sortedBuckets: StringKeyBucket[], +): PartitionSlice[] { + const slices: PartitionSlice[] = [] + let cursor = rangeFrom + + for (const bucket of sortedBuckets) { + const keyFrom = bucket.value + const keyTo = `${bucket.value}\0` + + // Gap slice before this key (non-hot residual between keys) + const gapFrom = maxBinaryString(cursor, rangeFrom) + const gapTo = minBinaryString(keyFrom, rangeTo) + if (compareBinaryStrings(gapFrom, gapTo) < 0) { + // There's a gap — but it has zero rows in our full distribution, + // so we skip it (all rows are accounted for by the key buckets) + } + + // Exact key slice + const sliceFrom = maxBinaryString(keyFrom, rangeFrom) + const sliceTo = minBinaryString(keyTo, rangeTo) + if (compareBinaryStrings(sliceFrom, sliceTo) < 0) { + slices.push(buildSliceFromRows(partition, { + ranges: replaceChunkRange(parentSlice, dimensionIndex, sliceFrom, sliceTo), + rows: bucket.rowCount, + focusedValue: { dimensionIndex, value: bucket.value }, + confidence: 'high', + reason: 'group-by-key-distribution', + lineage: parentSlice.analysis.lineage.concat([{ + strategyId: 'group-by-key-split', + dimensionIndex, + reason: 'split slice using full GROUP BY key distribution', + }]), + })) + } + + cursor = keyTo + } + + return slices +} diff --git a/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts b/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts new file mode 100644 index 0000000..86dca54 --- /dev/null +++ 
b/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts @@ -0,0 +1,6 @@ +import { buildRootSlice } from '../partition-slices.js' +import type { Partition, PartitionSlice } from '../types.js' + +export function buildSingleChunkPartition(partition: Partition): PartitionSlice[] { + return [buildRootSlice(partition)] +} diff --git a/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts b/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts new file mode 100644 index 0000000..d6558f8 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts @@ -0,0 +1,216 @@ +import pMap from 'p-map' +import { buildSliceFromRows } from '../partition-slices.js' +import { estimateRows, parsePlannerDateTime } from '../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' +import { bigIntToStr, strToBigInt } from '../utils/binary-string.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const BINARY_SEARCH_STEPS = 24 +const ESTIMATE_CONCURRENCY = 50 + +export async function splitSliceWithQuantiles( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + boundaries: string[], +): Promise { + const intervals: Array<{ from: string; to: string }> = [] + for (let index = 0; index < boundaries.length - 1; index++) { + const from = boundaries[index] + const to = boundaries[index + 1] + if (from === undefined || to === undefined || from === to) continue + intervals.push({ from, to }) + } + + const results = await pMap( + intervals, + async ({ from, to }) => { + const ranges = replaceChunkRange(slice, dimensionIndex, from, to) + const rows = await estimateRows( + context, + { partitionId: partition.partitionId, ranges }, + sortKeys, + ) + if (rows <= 0) return null + return buildSliceFromRows(partition, { + ranges, + rows, + focusedValue: 
slice.analysis.focusedValue, + confidence: context.rowProbeStrategy === 'count' ? 'exact' : 'high', + reason: context.rowProbeStrategy === 'count' ? 'exact-count' : 'quantile-estimate', + lineage: slice.analysis.lineage.concat([ + { + strategyId: 'quantile-range-split', + dimensionIndex, + reason: 'split slice into quantile-aligned ranges', + }, + ]), + }) + }, + { concurrency: ESTIMATE_CONCURRENCY }, + ) + + return results.filter((s): s is PartitionSlice => s !== null) +} + +export async function findQuantileBoundaryOnDimension( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + targetCumRows: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) { + throw new Error(`Missing sort key at dimension ${dimensionIndex}`) + } + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) { + throw new Error(`Missing range for quantile split on dimension ${dimensionIndex}`) + } + + if (sortKey.category === 'string') { + return findStringBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) + } + if (sortKey.category === 'datetime') { + return findDateTimeBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) + } + return findNumericBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) +} + +export function buildEvenlySpacedBoundaries( + rangeFrom: string, + rangeTo: string, + subCount: number, + sortKey: SortKey, +): string[] { + if (subCount <= 1) return [rangeFrom, rangeTo] + + if (sortKey.category === 'datetime') { + const start = parsePlannerDateTime(rangeFrom) + const end = parsePlannerDateTime(rangeTo) + return Array.from({ length: subCount + 1 }, (_, index) => + new Date(start + Math.floor(((end - start) * index) / subCount)).toISOString() + ) + } + + if (sortKey.category === 'numeric') { + const start = Number(rangeFrom) + const end = 
Number(rangeTo) + return Array.from({ length: subCount + 1 }, (_, index) => + String(start + Math.floor(((end - start) * index) / subCount)) + ) + } + + const width = Math.max(rangeFrom.length, rangeTo.length) + const start = strToBigInt(rangeFrom, width) + const end = strToBigInt(rangeTo, width) + const boundaries = Array.from({ length: subCount + 1 }, (_, index) => + bigIntToStr(start + ((end - start) * BigInt(index)) / BigInt(subCount), width, width) + ) + // Use original values at endpoints to avoid round-trip length changes + boundaries[0] = rangeFrom + boundaries[boundaries.length - 1] = rangeTo + return boundaries +} + +async function findStringBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + const width = Math.max(rangeFrom.length, rangeTo.length) + let low = strToBigInt(rangeFrom, width) + let high = strToBigInt(rangeTo, width) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = (low + high) / 2n + if (midpoint === low || midpoint === high) break + + const mid = bigIntToStr(midpoint, width, width) + const rows = await estimateRowsUntil(context, slice, sortKeys, dimensionIndex, rangeFrom, mid) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return bigIntToStr((low + high) / 2n, width, width) +} + +async function findDateTimeBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + let low = parsePlannerDateTime(rangeFrom) + let high = parsePlannerDateTime(rangeTo) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const mid = new Date(midpoint).toISOString() + const rows = await estimateRowsUntil(context, slice, sortKeys, 
dimensionIndex, rangeFrom, mid) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return new Date(Math.floor((low + high) / 2)).toISOString() +} + +async function findNumericBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + let low = Number(rangeFrom) + let high = Number(rangeTo) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const rows = await estimateRowsUntil(context, slice, sortKeys, dimensionIndex, rangeFrom, String(midpoint)) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return String(Math.floor((low + high) / 2)) +} + +async function estimateRowsUntil( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, +): Promise { + return estimateRows( + context, + { + partitionId: slice.partitionId, + ranges: replaceChunkRange(slice, dimensionIndex, rangeFrom, rangeTo), + }, + sortKeys + ) +} diff --git a/packages/plugin-backfill/src/chunking/strategies/refinement.ts b/packages/plugin-backfill/src/chunking/strategies/refinement.ts new file mode 100644 index 0000000..4d27c2a --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/refinement.ts @@ -0,0 +1,128 @@ +import { buildSliceEstimate } from '../partition-slices.js' +import { countRowsExact, getRowProbeStrategy } from '../services/row-probe.js' +import type { + Partition, + PartitionBuildResult, + PartitionDiagnostics, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' + +const ESTIMATE_RATIO_MIN = 0.7 +const ESTIMATE_RATIO_MAX = 1.3 + +export async function refinePartitionSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], + 
usedDistributionFallback: boolean, +): Promise { + let workingSlices = slices + let usedLowConfidenceChunkRefinement = false + + if (slices.some((slice) => slice.estimate.confidence === 'low')) { + workingSlices = await refineLowConfidenceSlices(context, partition, slices, sortKeys) + usedLowConfidenceChunkRefinement = true + } + + const diagnostics = buildPartitionDiagnostics( + partition, + workingSlices, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + false + ) + + if ( + getRowProbeStrategy(context) !== 'explain-estimate' || + !diagnostics.suspiciousEstimate + ) { + return { slices: workingSlices, diagnostics } + } + + const refinedSlices = await refineAllSlices(context, partition, workingSlices, sortKeys) + return { + slices: refinedSlices, + diagnostics: buildPartitionDiagnostics( + partition, + refinedSlices, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + true + ), + } +} + +export function buildPartitionDiagnostics( + partition: Partition, + slices: PartitionSlice[], + usedDistributionFallback: boolean, + usedLowConfidenceChunkRefinement: boolean, + usedExactCountFallback: boolean, +): PartitionDiagnostics { + const estimatedRowSum = slices.reduce((sum, slice) => sum + slice.estimate.rows, 0) + const estimateToExactRatio = partition.rows > 0 ? 
estimatedRowSum / partition.rows : 1 + + return { + estimatedRowSum, + exactPartitionRows: partition.rows, + estimateToExactRatio, + suspiciousEstimate: + estimateToExactRatio < ESTIMATE_RATIO_MIN || estimateToExactRatio > ESTIMATE_RATIO_MAX, + lowConfidenceChunkCount: slices.filter((slice) => slice.estimate.confidence === 'low').length, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + usedExactCountFallback, + } +} + +async function refineLowConfidenceSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], +): Promise { + const refined: PartitionSlice[] = [] + + for (const slice of slices) { + if (slice.estimate.confidence !== 'low') { + refined.push(slice) + continue + } + refined.push(await refineSlice(context, partition, slice, sortKeys)) + } + + return refined +} + +async function refineAllSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], +): Promise { + return Promise.all(slices.map((slice) => refineSlice(context, partition, slice, sortKeys))) +} + +async function refineSlice( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], +): Promise { + const rows = await countRowsExact( + context, + { + partitionId: partition.partitionId, + ranges: slice.ranges, + }, + sortKeys + ) + + return { + ...slice, + estimate: buildSliceEstimate(partition, rows, 'exact', 'exact-count'), + } +} diff --git a/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts b/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts new file mode 100644 index 0000000..5c050b6 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts @@ -0,0 +1,144 @@ +import { buildSliceFromRows } from '../partition-slices.js' +import { probeStringPrefixDistribution } from '../services/distribution-source.js' +import type { + Partition, + PartitionSlice, + 
PlannerContext, + SortKey, + StringPrefixBucket, +} from '../types.js' +import { + buildObservedStringUpperBound, + maxBinaryString, + minBinaryString, + nextPrefixValue, +} from '../utils/binary-string.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const TARGET_BYTES_FUZZ_FACTOR = 1.15 +const PREFIX_START_DEPTH = 1 +const PREFIX_MAX_DEPTH = 4 + +export async function splitSliceWithStringPrefixes( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'string') return [] + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return [] + + return buildPrefixSlices( + context, + partition, + slice, + sortKeys, + dimensionIndex, + range.from, + range.to, + PREFIX_START_DEPTH + ) +} + +export function buildRootStringUpperBound(maxValue: string): string { + return buildObservedStringUpperBound(maxValue) +} + +async function buildPrefixSlices( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + depth: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [] + + const buckets = await probeStringPrefixDistribution( + context, + partition.partitionId, + replaceChunkRange(slice, dimensionIndex, rangeFrom, rangeTo), + sortKey, + dimensionIndex, + depth, + sortKeys + ) + + const slices: PartitionSlice[] = [] + for (const bucket of buckets) { + if (bucket.rowCount <= 0) continue + + const bucketSlice = buildBucketSlice(partition, slice, dimensionIndex, rangeFrom, rangeTo, bucket) + if (!bucketSlice) continue + + if (bucketSlice.estimate.bytesUncompressed <= context.targetChunkBytes * TARGET_BYTES_FUZZ_FACTOR) { + slices.push(bucketSlice) + continue + } + + if (!bucket.isExactValue 
&& depth < PREFIX_MAX_DEPTH) { + const bucketRange = getChunkRange(bucketSlice, dimensionIndex) + if (bucketRange.from !== undefined && bucketRange.to !== undefined) { + slices.push( + ...(await buildPrefixSlices( + context, + partition, + slice, + sortKeys, + dimensionIndex, + bucketRange.from, + bucketRange.to, + depth + 1 + )) + ) + continue + } + } + + slices.push(bucketSlice) + } + + return slices +} + +function buildBucketSlice( + partition: Partition, + parentSlice: PartitionSlice, + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + bucket: StringPrefixBucket, +): PartitionSlice | undefined { + const bucketFrom = maxBinaryString(rangeFrom, bucket.value) + const bucketUpper = bucket.isExactValue ? `${bucket.value}\0` : nextPrefixValue(bucket.value) + if (bucketUpper === undefined) return undefined + + const bucketTo = minBinaryString(rangeTo, bucketUpper) + if (bucketFrom === bucketTo) return undefined + + const focusedValue = bucket.isExactValue + ? { dimensionIndex, value: bucket.value } + : parentSlice.analysis.focusedValue + + return buildSliceFromRows(partition, { + ranges: replaceChunkRange(parentSlice, dimensionIndex, bucketFrom, bucketTo), + rows: bucket.rowCount, + focusedValue, + confidence: 'high', + reason: 'string-prefix-distribution', + lineage: parentSlice.analysis.lineage.concat([ + { + strategyId: 'string-prefix-split', + dimensionIndex, + reason: 'split slice using string prefix distribution', + }, + ]), + }) +} diff --git a/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts b/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts new file mode 100644 index 0000000..5b83f6a --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts @@ -0,0 +1,117 @@ +import { buildSliceFromRows, getTargetChunkRows } from '../partition-slices.js' +import { probeTemporalDistribution } from '../services/distribution-source.js' +import { parsePlannerDateTime } from 
'../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, + TemporalBucket, +} from '../types.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const TARGET_BYTES_FUZZ_FACTOR = 1.15 + +export async function splitSliceWithTemporalBuckets( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const dayBuckets = await probeTemporalDistribution( + context, + partition.partitionId, + slice.ranges, + sortKeys, + dimensionIndex, + 'day' + ) + if (dayBuckets.length === 0) return [slice] + + const daySlices = buildTemporalSlices(partition, slice, dimensionIndex, dayBuckets, context.targetChunkBytes) + if (daySlices.every((candidate) => candidate.estimate.bytesUncompressed <= context.targetChunkBytes * TARGET_BYTES_FUZZ_FACTOR)) { + return daySlices + } + + const hourBuckets = await probeTemporalDistribution( + context, + partition.partitionId, + slice.ranges, + sortKeys, + dimensionIndex, + 'hour' + ) + if (hourBuckets.length === 0) return daySlices + + return buildTemporalSlices(partition, slice, dimensionIndex, hourBuckets, context.targetChunkBytes) +} + +export function getPartitionEndExclusive(partition: Partition): string { + return new Date(parsePlannerDateTime(partition.maxTime) + 1000).toISOString() +} + +function buildTemporalSlices( + partition: Partition, + parentSlice: PartitionSlice, + dimensionIndex: number, + buckets: TemporalBucket[], + targetChunkBytes: number, +): PartitionSlice[] { + const targetChunkRows = getTargetChunkRows(partition, targetChunkBytes) + const slices: PartitionSlice[] = [] + let currentStart: string | undefined + let currentRows = 0 + const parentRange = getChunkRange(parentSlice, dimensionIndex) + const sliceStart = parentRange.from + const sliceEnd = parentRange.to ?? 
getPartitionEndExclusive(partition) + + for (let index = 0; index < buckets.length; index++) { + const bucket = buckets[index] + if (!bucket) continue + + const bucketStart = sliceStart && bucket.start < sliceStart ? sliceStart : bucket.start + if (currentStart === undefined) { + currentStart = bucketStart + } + + const wouldExceed = currentRows > 0 && currentRows + bucket.rowCount > targetChunkRows * TARGET_BYTES_FUZZ_FACTOR + if (wouldExceed && currentStart !== undefined && currentStart < bucketStart) { + slices.push(buildSlice(parentSlice, partition, dimensionIndex, currentStart, bucketStart, currentRows)) + currentStart = bucketStart + currentRows = 0 + } + + currentRows += bucket.rowCount + + if (index === buckets.length - 1 && currentStart !== undefined && currentStart < sliceEnd) { + slices.push(buildSlice(parentSlice, partition, dimensionIndex, currentStart, sliceEnd, currentRows)) + } + } + + return slices.length > 0 ? slices : [parentSlice] +} + +function buildSlice( + parentSlice: PartitionSlice, + partition: Partition, + dimensionIndex: number, + from: string, + to: string, + rows: number, +): PartitionSlice { + return buildSliceFromRows(partition, { + ranges: replaceChunkRange(parentSlice, dimensionIndex, from, to), + rows, + focusedValue: parentSlice.analysis.focusedValue, + confidence: 'low', + reason: 'temporal-distribution', + lineage: parentSlice.analysis.lineage.concat([ + { + strategyId: 'temporal-bucket-split', + dimensionIndex, + reason: 'split slice using temporal distribution buckets', + }, + ]), + }) +} diff --git a/packages/plugin-backfill/src/chunking/strategy-policy.test.ts b/packages/plugin-backfill/src/chunking/strategy-policy.test.ts new file mode 100644 index 0000000..04880b7 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategy-policy.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, test } from 'bun:test' + +import { getCandidateDimensions } from './strategy-policy.js' + +describe('getCandidateDimensions', () => { + 
test('preserves declared sort-key order regardless of type', () => { + expect(getCandidateDimensions([ + { name: 'event_time', type: 'DateTime', category: 'datetime', boundaryEncoding: 'literal' }, + { name: 'account_id', type: 'String', category: 'string', boundaryEncoding: 'hex-latin1' }, + { name: 'seq', type: 'UInt64', category: 'numeric', boundaryEncoding: 'literal' }, + ])).toEqual([0, 1, 2]) + }) +}) diff --git a/packages/plugin-backfill/src/chunking/strategy-policy.ts b/packages/plugin-backfill/src/chunking/strategy-policy.ts new file mode 100644 index 0000000..0b1a4d1 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategy-policy.ts @@ -0,0 +1,8 @@ +import type { PartitionSlice, SortKey } from './types.js' + +export function getCandidateDimensions( + sortKeys: SortKey[], + _slice?: PartitionSlice, +): number[] { + return sortKeys.map((_, index) => index) +} diff --git a/packages/plugin-backfill/src/chunking/types.ts b/packages/plugin-backfill/src/chunking/types.ts index 0c76952..6cd7910 100644 --- a/packages/plugin-backfill/src/chunking/types.ts +++ b/packages/plugin-backfill/src/chunking/types.ts @@ -1,31 +1,170 @@ -export interface PartitionInfo { +export type RowProbeStrategy = 'explain-estimate' | 'count' + +export type SortKeyCategory = 'numeric' | 'datetime' | 'string' + +export type SortKeyBoundaryEncoding = 'literal' | 'hex-latin1' + +export type EstimateConfidence = 'high' | 'low' | 'exact' + +export type EstimateReason = + | 'partition-metadata' + | 'quantile-estimate' + | 'string-prefix-distribution' + | 'group-by-key-distribution' + | 'temporal-distribution' + | 'equal-width-distribution' + | 'exact-count' + +export interface SortKey { + name: string + type: string + category: SortKeyCategory + boundaryEncoding: SortKeyBoundaryEncoding +} + +export interface ChunkRange { + dimensionIndex: number + from?: string + to?: string +} + +export interface ChunkDerivationStep { + strategyId: string + dimensionIndex?: number + reason: string 
+} + +export interface ChunkEstimate { + rows: number + bytesCompressed: number + bytesUncompressed: number + confidence: EstimateConfidence + reason: EstimateReason +} + +export interface FocusedValue { + dimensionIndex: number + value: string +} + +export interface ChunkAnalysis { + focusedValue?: FocusedValue + lineage: ChunkDerivationStep[] +} + +export interface Chunk { + id: string + partitionId: string + ranges: ChunkRange[] + estimate: ChunkEstimate + analysis: ChunkAnalysis +} + +export interface PartitionDiagnostics { + estimatedRowSum: number + exactPartitionRows: number + estimateToExactRatio: number + suspiciousEstimate: boolean + lowConfidenceChunkCount: number + usedDistributionFallback: boolean + usedLowConfidenceChunkRefinement: boolean + usedExactCountFallback: boolean +} + +export interface Partition { partitionId: string rows: number - bytesOnDisk: number + bytesCompressed: number + bytesUncompressed: number minTime: string maxTime: string + diagnostics?: PartitionDiagnostics } -export interface SortKeyInfo { - column: string - type: string - category: 'numeric' | 'datetime' | 'string' +export interface TableProfile { + database: string + table: string + sortKeys: SortKey[] +} + +export interface ChunkPlanStats { + totalPartitions: number + oversizedPartitions: number + focusedChunks: number + totalChunks: number + avgChunkBytes: number + maxChunkBytes: number + minChunkBytes: number +} + +export interface ChunkPlan { + planId: string + generatedAt: string + rowProbeStrategy: RowProbeStrategy + targetChunkBytes: number + table: TableProfile + partitions: Partition[] + chunks: Chunk[] + totalRows: number + totalBytesCompressed: number + totalBytesUncompressed: number + stats: ChunkPlanStats +} + +export type PlannerQuery = (sql: string, settings?: Record) => Promise + +export interface PlannerContext { + database: string + table: string + from?: string + to?: string + targetChunkBytes: number + query: PlannerQuery + querySettings?: Record + 
rowProbeStrategy: RowProbeStrategy } -export interface ChunkBoundary { +export interface EstimateFilter { partitionId: string - sortKeyFrom?: string - sortKeyTo?: string - estimatedBytes: number + ranges: ChunkRange[] + exactDimensionIndex?: number + exactValue?: string } -export interface PlannedChunk { - id: string +export interface StringPrefixBucket { + value: string + rowCount: number + isExactValue: boolean +} + +export interface TemporalBucket { + start: string + rowCount: number +} + +export interface PartitionSlice { partitionId: string - sortKeyFrom?: string - sortKeyTo?: string - estimatedBytes: number - idempotencyToken: string - from: string - to: string + ranges: ChunkRange[] + estimate: ChunkEstimate + analysis: ChunkAnalysis +} + +export interface PartitionBuildResult { + slices: PartitionSlice[] + diagnostics: PartitionDiagnostics +} + +export interface PlanChunkOptions { + requireIdempotencyToken: boolean +} + +export interface GenerateChunkPlanInput { + database: string + table: string + from?: string + to?: string + targetChunkBytes: number + query: PlannerQuery + querySettings?: Record + rowProbeStrategy?: RowProbeStrategy } diff --git a/packages/plugin-backfill/src/chunking/utils/binary-string.ts b/packages/plugin-backfill/src/chunking/utils/binary-string.ts new file mode 100644 index 0000000..b264f2d --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/binary-string.ts @@ -0,0 +1,62 @@ +export function compareBinaryStrings(left: string, right: string): number { + return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1')) +} + +export function minBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) <= 0 ? left : right +} + +export function maxBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) >= 0 ? 
left : right +} + +export function nextPrefixValue(prefix: string): string | undefined { + if (prefix.length === 0) return undefined + + const buffer = Buffer.from(prefix, 'latin1') + for (let index = buffer.length - 1; index >= 0; index--) { + const byte = buffer[index] + if (byte === undefined || byte === 0xff) continue + + const next = Buffer.from(buffer.subarray(0, index + 1)) + next[index] = byte + 1 + return next.toString('latin1') + } + + return undefined +} + +export function buildObservedStringUpperBound(maxValue: string): string { + return `${maxValue}\0` +} + +export function strToBigInt(value: string, padTo: number): bigint { + const buffer = Buffer.from(value, 'latin1') + let result = 0n + + for (let index = 0; index < padTo; index++) { + const byte = index < buffer.length ? (buffer[index] ?? 0) : 0 + result = (result << 8n) | BigInt(byte) + } + + return result +} + +export function bigIntToStr(value: bigint, length: number, minLength = 0): string { + const buffer = Buffer.alloc(length) + let remaining = value + + for (let index = length - 1; index >= 0; index--) { + buffer[index] = Number(remaining & 0xffn) + remaining >>= 8n + } + + // Strip trailing null bytes so boundaries match real string values + // in ClickHouse comparisons (where "abc" < "abc\0"), but preserve + // at least minLength bytes to avoid losing meaningful trailing nulls + // (e.g. from buildObservedStringUpperBound which appends "\0"). 
+ let end = length + while (end > minLength && buffer[end - 1] === 0) end-- + + return buffer.subarray(0, end).toString('latin1') +} diff --git a/packages/plugin-backfill/src/chunking/utils/ids.ts b/packages/plugin-backfill/src/chunking/utils/ids.ts new file mode 100644 index 0000000..32c0ff3 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/ids.ts @@ -0,0 +1,17 @@ +import { hashId, randomPlanId } from '../../state.js' + +export function generatePlanId(): string { + return randomPlanId() +} + +export function generateChunkId( + planId: string, + partitionId: string, + index: number, +): string { + return hashId(`chunk:${planId}:${partitionId}:${index}`).slice(0, 16) +} + +export function generateIdempotencyToken(planId: string, chunkId: string): string { + return hashId(`token:${planId}:${chunkId}`) +} diff --git a/packages/plugin-backfill/src/chunking/utils/ranges.ts b/packages/plugin-backfill/src/chunking/utils/ranges.ts new file mode 100644 index 0000000..3af1571 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/ranges.ts @@ -0,0 +1,31 @@ +import type { ChunkRange, PartitionSlice } from '../types.js' + +export function getChunkRange( + slice: Pick, + dimensionIndex: number, +): ChunkRange { + return ( + slice.ranges.find((range) => range.dimensionIndex === dimensionIndex) ?? 
{ + dimensionIndex, + from: undefined, + to: undefined, + } + ) +} + +export function replaceChunkRange( + slice: Pick, + dimensionIndex: number, + from: string | undefined, + to: string | undefined, +): ChunkRange[] { + return slice.ranges + .filter((range) => range.dimensionIndex !== dimensionIndex) + .concat([{ dimensionIndex, from, to }]) + .sort((left, right) => left.dimensionIndex - right.dimensionIndex) +} + +export function isExactChunkRange(range: Pick): boolean { + if (range.from === undefined || range.to === undefined) return false + return range.to === `${range.from}\0` +} diff --git a/packages/plugin-backfill/src/index.ts b/packages/plugin-backfill/src/index.ts index 3420da7..c2c8446 100644 --- a/packages/plugin-backfill/src/index.ts +++ b/packages/plugin-backfill/src/index.ts @@ -1,16 +1,6 @@ import './table-config.js' export { backfill, createBackfillPlugin } from './plugin.js' -export { executeBackfill, syncProgress } from './async-backfill.js' -export { analyzeAndChunk } from './chunking/analyze.js' -export type { - BackfillOptions, - BackfillChunkState, - BackfillProgress, - BackfillResult, -} from './async-backfill.js' export type { BackfillPlugin, BackfillPluginOptions, BackfillPluginRegistration } from './types.js' export type { PluginConfig } from './options.js' export type { BackfillTableConfig } from './table-config.js' -export type { AnalyzeAndChunkInput, AnalyzeAndChunkResult } from './chunking/analyze.js' -export type { PlannedChunk, PartitionInfo, SortKeyInfo } from './chunking/types.js' diff --git a/packages/plugin-backfill/src/logging.ts b/packages/plugin-backfill/src/logging.ts new file mode 100644 index 0000000..b2765a9 --- /dev/null +++ b/packages/plugin-backfill/src/logging.ts @@ -0,0 +1,60 @@ +import { getLogger, type Logger } from '@logtape/logtape' + +export const CHKIT_LOGGER_CATEGORY = ['chkit'] as const +export const CHKIT_BACKFILL_LOGGER_CATEGORY = [...CHKIT_LOGGER_CATEGORY, 'backfill'] as const +export const 
SLOW_CLICKHOUSE_QUERY_MS = 5000 +export const SLOW_CLICKHOUSE_QUERY_REPEAT_INITIAL_MS = 5000 +export const SLOW_CLICKHOUSE_QUERY_REPEAT_MAX_MS = 30000 + +export function getBackfillLogger(...segments: string[]): Logger { + return getLogger([...CHKIT_BACKFILL_LOGGER_CATEGORY, ...segments]) +} + +export function formatBytes(bytes: number): string { + if (bytes >= 1024 ** 4) return `${(bytes / 1024 ** 4).toFixed(1)} TiB` + if (bytes >= 1024 ** 3) return `${(bytes / 1024 ** 3).toFixed(1)} GiB` + if (bytes >= 1024 ** 2) return `${(bytes / 1024 ** 2).toFixed(1)} MiB` + if (bytes >= 1024) return `${(bytes / 1024).toFixed(1)} KiB` + return `${bytes} B` +} + +export function summarizeSql(sql: string, maxLength = 240): string { + const normalized = normalizeSql(sql) + if (normalized.length <= maxLength) return normalized + return `${normalized.slice(0, maxLength - 3)}...` +} + +export function describeSqlOperation(sql: string): string { + const normalized = normalizeSql(sql) + + const prefixDistribution = normalized.match(/^SELECT substring\((\w+), 1, \d+\) AS prefix, count\(\) AS cnt /) + if (prefixDistribution?.[1]) return `prefix distribution on ${prefixDistribution[1]}` + + const temporalDistribution = normalized.match(/^SELECT formatDateTime\(toStartOf(Day|Hour)\((\w+)\)/) + if (temporalDistribution?.[1] && temporalDistribution[2]) { + return `${temporalDistribution[1].toLowerCase()} distribution on ${temporalDistribution[2]}` + } + + const minMaxProbe = normalized.match(/^SELECT toString\(min\((\w+)\)\) AS minVal, toString\(max\(\1\)\) AS maxVal /) + if (minMaxProbe?.[1]) return `range probe on ${minMaxProbe[1]}` + + if (normalized.startsWith('SELECT count() AS cnt FROM ')) return 'row count probe' + if (normalized.startsWith('SELECT sorting_key FROM system.tables')) return 'sort key introspection' + if (normalized.startsWith('SELECT name, type FROM system.columns')) return 'column introspection' + if (normalized.startsWith('SELECT partition_id,')) return 'partition 
introspection' + if (normalized.startsWith('SELECT 1 FROM ')) return 'table existence probe' + + return summarizeSql(normalized, 100) +} + +export function describeSqlContext(sql: string): string | undefined { + const normalized = normalizeSql(sql) + const partitionId = normalized.match(/_partition_id = '([^']+)'/)?.[1] + + if (partitionId) return `partition ${partitionId}` + return undefined +} + +function normalizeSql(sql: string): string { + return sql.replace(/\s+/g, ' ').trim() +} diff --git a/packages/plugin-backfill/src/partition-planner.test.ts b/packages/plugin-backfill/src/partition-planner.test.ts deleted file mode 100644 index b10c8da..0000000 --- a/packages/plugin-backfill/src/partition-planner.test.ts +++ /dev/null @@ -1,185 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { buildChunkBoundaries } from './chunking/build.js' -import { buildChunkSql } from './chunking/sql.js' -import { buildPlannedChunks } from './chunking/analyze.js' -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const GiB = 1024 ** 3 - -function buildChunksWithSql(input: { - planId: string - target: string - partitions: PartitionInfo[] - maxChunkBytes: number - sortKey?: SortKeyInfo - sortKeyRanges?: Map - requireIdempotencyToken: boolean - mvAsQuery?: string - targetColumns?: string[] -}) { - const boundaries = buildChunkBoundaries({ - partitions: input.partitions, - maxChunkBytes: input.maxChunkBytes, - sortKey: input.sortKey, - sortKeyRanges: input.sortKeyRanges, - }) - - const planned = buildPlannedChunks({ - planId: input.planId, - partitions: input.partitions, - boundaries, - requireIdempotencyToken: input.requireIdempotencyToken, - }) - - return planned.map(chunk => ({ - ...chunk, - sqlTemplate: buildChunkSql({ - planId: input.planId, - chunk, - target: input.target, - sortKey: input.sortKey, - mvAsQuery: input.mvAsQuery, - targetColumns: input.targetColumns, - }), - })) -} - -describe('buildChunksWithSql', () => { - const basePlanId = 
'abc1234567890123' - - test('small partition produces one chunk with _partition_id filter only', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T23:59:59.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - expect(chunks).toHaveLength(1) - expect(chunks[0]?.sqlTemplate).toContain("WHERE _partition_id = '202501'") - expect(chunks[0]?.partitionId).toBe('202501') - expect(chunks[0]?.estimatedBytes).toBe(5 * GiB) - }) - - test('large partition with datetime sort key produces sub-chunks with parseDateTimeBestEffort', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202501', { min: '2025-01-01 00:00:00', max: '2025-01-31 00:00:00' }], - ]) - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - requireIdempotencyToken: true, - }) - - expect(chunks).toHaveLength(3) - for (const chunk of chunks) { - expect(chunk.sqlTemplate).toContain("WHERE _partition_id = '202501'") - expect(chunk.sqlTemplate).toContain('event_time >= parseDateTimeBestEffort(') - expect(chunk.sqlTemplate).toContain('event_time < parseDateTimeBestEffort(') - expect(chunk.partitionId).toBe('202501') - } - }) - - test('chunk IDs are deterministic for same input', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const first = buildChunksWithSql({ 
- planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - const second = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - expect(first[0]?.id).toBe(second[0]?.id) - expect(first[0]?.idempotencyToken).toBe(second[0]?.idempotencyToken) - }) - - test('idempotency tokens are empty when not required', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: false, - }) - - expect(chunks[0]?.idempotencyToken).toBe('') - expect(chunks[0]?.sqlTemplate).not.toContain('insert_deduplication_token') - }) - - test('SQL templates include correct INSERT and SELECT structure', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - const sql = chunks[0]?.sqlTemplate ?? 
'' - expect(sql).toContain(`/* chkit backfill plan=${basePlanId}`) - expect(sql).toContain('INSERT INTO default.events') - expect(sql).toContain('SELECT *') - expect(sql).toContain('FROM default.events') - expect(sql).toContain('SETTINGS async_insert=0') - }) - - test('numeric sort key sub-chunks use direct comparison', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 20 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'id', type: 'UInt64', category: 'numeric' } - const sortKeyRanges = new Map([ - ['202501', { min: '100', max: '200' }], - ]) - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - requireIdempotencyToken: false, - }) - - expect(chunks).toHaveLength(2) - expect(chunks[0]?.sqlTemplate).toContain("id >= '100'") - expect(chunks[0]?.sqlTemplate).toContain("id < '150'") - expect(chunks[0]?.sqlTemplate).not.toContain('parseDateTimeBestEffort') - }) -}) diff --git a/packages/plugin-backfill/src/payload.ts b/packages/plugin-backfill/src/payload.ts index f17e096..f79d2aa 100644 --- a/packages/plugin-backfill/src/payload.ts +++ b/packages/plugin-backfill/src/payload.ts @@ -27,15 +27,13 @@ export function planPayload(output: BuildBackfillPlanOutput): { target: output.plan.target, from: output.plan.from, to: output.plan.to, - chunkCount: output.plan.chunks.length, + chunkCount: output.plan.chunkPlan.chunks.length, maxChunkBytes: output.plan.options.maxChunkBytes, sortKeyColumn: output.plan.options.sortKeyColumn, planPath: output.planPath, - strategy: output.plan.strategy, - partitionCount: output.plan.partitions?.length, - totalBytes: output.plan.partitions - ? 
output.plan.partitions.reduce((sum, p) => sum + p.bytesOnDisk, 0) - : undefined, + strategy: output.plan.execution.mode, + partitionCount: output.plan.chunkPlan.partitions.length, + totalBytes: output.plan.chunkPlan.totalBytesCompressed, } } diff --git a/packages/plugin-backfill/src/planner.test.ts b/packages/plugin-backfill/src/planner.test.ts index be8f8cb..4d83103 100644 --- a/packages/plugin-backfill/src/planner.test.ts +++ b/packages/plugin-backfill/src/planner.test.ts @@ -1,39 +1,52 @@ import { describe, expect, test } from 'bun:test' -import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises' -import { join, resolve } from 'node:path' +import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises' +import { dirname, join, resolve } from 'node:path' import { tmpdir } from 'node:os' import { resolveConfig } from '@chkit/core' +import { buildChunkExecutionSql, rewriteSelectColumns } from './chunking/sql.js' +import { generateIdempotencyToken } from './chunking/utils/ids.js' import { PlanSchema } from './options.js' import { buildBackfillPlan } from './planner.js' -import { injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' -import { computeBackfillStateDir, computeEnvironmentFingerprint } from './state.js' +import { backfillPaths, computeBackfillStateDir, readPlan } from './state.js' function createMockQuery(opts: { - partitions?: Array<{ partition_id: string; total_rows: string; total_bytes: string; min_time: string; max_time: string }> + partitions?: Array<{ + partition_id: string + total_rows: string + total_bytes: string + total_uncompressed_bytes?: string + min_time: string + max_time: string + }> sortingKey?: string - sortKeyType?: string - sortKeyRanges?: Array<{ partition_id: string; min_val: string; max_val: string }> + columnRows?: Array<{ name: string; type: string }> } = {}): (sql: string) => Promise { const partitions = opts.partitions ?? 
[ - { partition_id: '202601', total_rows: '1000', total_bytes: '500000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 18:00:00' }, + { + partition_id: '202601', + total_rows: '1000', + total_bytes: '500000', + total_uncompressed_bytes: '1000000', + min_time: '2026-01-01 00:00:00', + max_time: '2026-01-01 18:00:00', + }, ] const sortingKey = opts.sortingKey ?? 'event_time' - const sortKeyType = opts.sortKeyType ?? 'DateTime' - const sortKeyRanges = opts.sortKeyRanges ?? [] + const columnRows = opts.columnRows ?? [{ name: 'event_time', type: 'DateTime' }] return async (sql: string) => { - if (sql.includes('system.parts')) return partitions as T[] - if (sql.includes('system.tables')) return [{ sorting_key: sortingKey }] as T[] - if (sql.includes('system.columns')) return [{ type: sortKeyType }] as T[] - if (sql.includes('min(') && sql.includes('max(')) return sortKeyRanges as T[] + if (sql.includes('SELECT 1 FROM')) return [{ ok: 1 }] as T[] + if (sql.includes('FROM system.parts')) return partitions as T[] + if (sql.includes('FROM system.tables')) return [{ sorting_key: sortingKey }] as T[] + if (sql.includes('FROM system.columns')) return columnRows as T[] return [] as T[] } } describe('@chkit/plugin-backfill planning', () => { - test('each plan gets a unique random id', async () => { + test('each plan gets a unique random id and canonical chunk plan', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') @@ -42,12 +55,37 @@ describe('@chkit/plugin-backfill planning', () => { schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events', from: '2026-01-01T00:00:00.000Z', to: '2026-01-01T18:00:00.000Z' }) + const opts = PlanSchema.parse({ + target: 'app.events', + from: '2026-01-01T00:00:00.000Z', + to: '2026-01-01T18:00:00.000Z', + }) const mockQuery = createMockQuery({ partitions: [ - { partition_id: '202601a', total_rows: '500', 
total_bytes: '250000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 06:00:00' }, - { partition_id: '202601b', total_rows: '500', total_bytes: '250000', min_time: '2026-01-01 06:00:00', max_time: '2026-01-01 12:00:00' }, - { partition_id: '202601c', total_rows: '500', total_bytes: '250000', min_time: '2026-01-01 12:00:00', max_time: '2026-01-01 18:00:00' }, + { + partition_id: '202601a', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 00:00:00', + max_time: '2026-01-01 06:00:00', + }, + { + partition_id: '202601b', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 06:00:00', + max_time: '2026-01-01 12:00:00', + }, + { + partition_id: '202601c', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 12:00:00', + max_time: '2026-01-01 18:00:00', + }, ], }) @@ -56,12 +94,24 @@ describe('@chkit/plugin-backfill planning', () => { expect(first.plan.planId).not.toBe(second.plan.planId) expect(first.plan.planId).toMatch(/^[a-f0-9]{16}$/) - expect(first.plan.chunks).toHaveLength(3) - - const chunk = first.plan.chunks[0] - expect(chunk?.idempotencyToken.length).toBe(64) - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events') - expect(chunk?.sqlTemplate).toContain(`insert_deduplication_token='${chunk?.idempotencyToken}'`) + expect(first.plan.chunkPlan.chunks).toHaveLength(3) + + const chunk = first.plan.chunkPlan.chunks[0] + const token = chunk ? generateIdempotencyToken(first.plan.planId, chunk.id) : '' + const sql = chunk + ? 
buildChunkExecutionSql({ + planId: first.plan.planId, + chunk, + target: first.plan.target, + sourceTarget: first.plan.execution.sourceTarget, + table: first.plan.chunkPlan.table, + idempotencyToken: token, + }) + : '' + + expect(token).toHaveLength(64) + expect(sql).toContain('INSERT INTO app.events') + expect(sql).toContain(`insert_deduplication_token='${token}'`) } finally { await rm(dir, { recursive: true, force: true }) } @@ -76,22 +126,13 @@ describe('@chkit/plugin-backfill planning', () => { schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events', from: '2026-01-01T00:00:00.000Z', to: '2026-01-01T07:00:00.000Z' }) - const mockQuery = createMockQuery({ - partitions: [ - { partition_id: '202601a', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 02:00:00' }, - { partition_id: '202601b', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 02:00:00', max_time: '2026-01-01 04:00:00' }, - { partition_id: '202601c', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 04:00:00', max_time: '2026-01-01 06:00:00' }, - { partition_id: '202601d', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 06:00:00', max_time: '2026-01-01 07:00:00' }, - ], - }) - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) + const opts = PlanSchema.parse({ target: 'app.events' }) + const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) const raw = await readFile(output.planPath, 'utf8') - const persisted = JSON.parse(raw) as { planId: string; chunks: Array<{ id: string }> } + const persisted = JSON.parse(raw) as { planId: string; chunkPlan: { chunks: Array<{ id: string }> } } expect(persisted.planId).toBe(output.plan.planId) - expect(persisted.chunks.length).toBe(4) + expect(persisted.chunkPlan.chunks.length).toBe(1) expect(output.planPath).toContain('/plans/') 
} finally { await rm(dir, { recursive: true, force: true }) @@ -108,45 +149,24 @@ describe('@chkit/plugin-backfill planning', () => { metaDir: './chkit/meta', }) const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery({ - sortingKey: 'session_date', - sortKeyType: 'Date', + const output = await buildBackfillPlan({ + opts, + configPath, + config, + clickhouseQuery: createMockQuery({ + sortingKey: 'session_date', + columnRows: [{ name: 'session_date', type: 'Date' }], + }), }) - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.sortKey?.column).toBe('session_date') - expect(output.plan.sortKey?.category).toBe('datetime') + expect(output.plan.chunkPlan.table.sortKeys[0]?.name).toBe('session_date') + expect(output.plan.chunkPlan.table.sortKeys[0]?.category).toBe('datetime') expect(output.plan.options.sortKeyColumn).toBe('session_date') } finally { await rm(dir, { recursive: true, force: true }) } }) - test('chunk IDs are deterministic within a plan (derived from planId)', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) - - const chunkIds = output.plan.chunks.map(c => c.id) - const uniqueIds = new Set(chunkIds) - expect(uniqueIds.size).toBe(chunkIds.length) - for (const id of chunkIds) { - expect(id).toMatch(/^[a-f0-9]{16}$/) - } - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - test('computes state dir from config by default and plugin override', () => { const config = resolveConfig({ schema: './schema.ts', @@ -161,7 +181,7 @@ describe('@chkit/plugin-backfill planning', () => { 
expect(overriddenDir).toBe(resolve('/tmp/project/custom-state')) }) - test('generates MV replay SQL when schema contains materialized view', async () => { + test('generates MV replay execution metadata and SQL when schema contains materialized view', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') const schemaPath = join(dir, 'schema.ts') @@ -196,348 +216,56 @@ export const events_mv = { metaDir: './chkit/meta', }) const opts = PlanSchema.parse({ target: 'app.events_agg' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('mv_replay') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events_agg') - expect(chunk?.sqlTemplate).not.toContain('WITH _backfill_source AS (') - expect(chunk?.sqlTemplate).toContain('SELECT toStartOfHour(event_time)') - expect(chunk?.sqlTemplate).toContain('FROM app.events') - expect(chunk?.sqlTemplate).toContain('GROUP BY event_time') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - expect(chunk?.sqlTemplate).toContain(`insert_deduplication_token='${chunk?.idempotencyToken}'`) - expect(chunk?.sqlTemplate).not.toContain('FROM app.events_agg') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('MV replay rewrites SELECT columns to match target table order', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - const schemaPath = join(dir, 'schema.ts') - - try { - await writeFile( - schemaPath, - `export const sessions = { - kind: 'table', - database: 'app', - name: 'session_analytics', - columns: [ - { name: 'session_date', type: 'Date' }, - { name: 'session_id', type: 'String' }, - { name: 'skills', type: 'Array(String)' }, - { name: 'slash_commands', 
type: 'Array(String)' }, - { name: 'ingested_at', type: 'DateTime' }, - ], - engine: 'MergeTree', - primaryKey: ['session_date'], - orderBy: ['session_date', 'session_id'], -} -export const sessions_mv = { - kind: 'materialized_view', - database: 'app', - name: 'sessions_mv', - to: { database: 'app', name: 'session_analytics' }, - as: "SELECT *, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands FROM app.raw_sessions", -} -` - ) - - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.session_analytics' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('mv_replay') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.session_analytics') - expect(chunk?.sqlTemplate).not.toContain('INSERT INTO app.session_analytics (') - expect(chunk?.sqlTemplate).toContain( - "SELECT session_date, session_id, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands, ingested_at" - ) - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('omits insert_deduplication_token when requireIdempotencyToken is false', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events', requireIdempotencyToken: false }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - const chunk = output.plan.chunks[0] - expect(chunk?.idempotencyToken).toBe('') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - 
expect(chunk?.sqlTemplate).not.toContain('insert_deduplication_token') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('uses partition strategy when no MV is found', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('partition') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events') - expect(chunk?.sqlTemplate).toContain('FROM app.events') - expect(chunk?.sqlTemplate).toContain('_partition_id') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('throws when no partitions found', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery({ partitions: [] }) + const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) - await expect( - buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - ).rejects.toThrow('No partitions found') + expect(output.plan.execution.mode).toBe('mv_replay') + + const chunk = output.plan.chunkPlan.chunks[0] + const sql = chunk + ? 
buildChunkExecutionSql({ + planId: output.plan.planId, + chunk, + target: output.plan.target, + sourceTarget: output.plan.execution.sourceTarget, + table: output.plan.chunkPlan.table, + mvAsQuery: output.plan.execution.mvAsQuery, + targetColumns: output.plan.execution.targetColumns, + idempotencyToken: generateIdempotencyToken(output.plan.planId, chunk.id), + }) + : '' + + expect(sql).toContain('INSERT INTO app.events_agg') + expect(sql).toContain('SELECT toStartOfHour(event_time)') + expect(sql).toContain('FROM app.events') + expect(sql).toContain('GROUP BY event_time') + expect(sql).toContain('SETTINGS async_insert=0') + expect(sql).not.toContain('FROM app.events_agg') } finally { await rm(dir, { recursive: true, force: true }) } }) -}) - -describe('rewriteSelectColumns', () => { - test('reorders SELECT columns to match target table order', () => { - const query = 'SELECT *, _foo as bar, _baz as qux FROM source WHERE status = 1' - const result = rewriteSelectColumns(query, ['col_a', 'bar', 'col_b', 'qux']) - - expect(result).toContain('SELECT col_a, _foo as bar, col_b, _baz as qux') - expect(result).toContain('FROM source') - expect(result).toContain('WHERE status = 1') - }) - - test('preserves WITH clause when rewriting SELECT', () => { - const query = [ - 'WITH', - " arrayDistinct(extractAll(content, '\\w+')) AS _skills,", - " toUInt64(JSONExtractFloat(meta, 'input')) AS _input_tokens", - 'SELECT *, _skills as skills, _input_tokens as input_tokens', - 'FROM app.sessions', - 'WHERE length(content) > 0', - ].join('\n') - - const result = rewriteSelectColumns(query, ['session_id', 'skills', 'content', 'input_tokens']) - - expect(result).toContain('arrayDistinct') - expect(result).toContain('_input_tokens') - expect(result).toContain('SELECT session_id, _skills as skills, content, _input_tokens as input_tokens') - expect(result).toContain('FROM app.sessions') - expect(result).toContain('WHERE length(content) > 0') - }) - test('handles SELECT without star 
expansion', () => { - const query = 'SELECT toStartOfHour(event_time) AS event_time, count() AS cnt FROM events GROUP BY event_time' - const result = rewriteSelectColumns(query, ['cnt', 'event_time']) + test('MV replay rewrites SELECT columns to match target table order', () => { + const rewritten = rewriteSelectColumns( + "SELECT *, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands FROM app.raw_sessions", + ['session_date', 'session_id', 'skills', 'slash_commands', 'ingested_at'] + ) - expect(result).toContain('SELECT count() AS cnt, toStartOfHour(event_time) AS event_time') - expect(result).toContain('FROM events') - expect(result).toContain('GROUP BY event_time') + expect(rewritten).toContain('SELECT session_date, session_id, extractAll(content, \'skill\') AS skills, extractAll(content, \'cmd\') AS slash_commands, ingested_at') + expect(rewritten).toContain('FROM app.raw_sessions') }) - test('returns query unchanged when SELECT/FROM cannot be found', () => { - const query = 'INSERT INTO t VALUES (1, 2)' - const result = rewriteSelectColumns(query, ['a', 'b']) + test('MV replay preserves DISTINCT when rewriting projection columns', () => { + const rewritten = rewriteSelectColumns( + 'SELECT DISTINCT event_time AS ts, user_id AS uid FROM app.events', + ['uid', 'ts'] + ) - expect(result).toBe(query) + expect(rewritten).toContain('SELECT DISTINCT user_id AS uid, event_time AS ts') + expect(rewritten).toContain('FROM app.events') }) -}) - -describe('injectSortKeyFilter', () => { - const from = '2025-01-01T00:00:00.000Z' - const to = '2025-01-01T06:00:00.000Z' - - test('injects WHERE before GROUP BY for datetime filter', () => { - const query = 'SELECT toStartOfHour(event_time) AS event_time, count() AS count FROM app.events GROUP BY event_time' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain("WHERE event_time >= parseDateTimeBestEffort('2025-01-01T00:00:00.000Z')") - 
expect(result).toContain("AND event_time < parseDateTimeBestEffort('2025-01-01T06:00:00.000Z')") - expect(result).toContain('GROUP BY event_time') - expect(result.indexOf('WHERE')).toBeLessThan(result.indexOf('GROUP BY')) - }) - - test('appends AND to existing WHERE clause', () => { - const query = 'SELECT * FROM app.events WHERE status = 1' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain('WHERE status = 1') - expect(result).toContain("AND event_time >= parseDateTimeBestEffort('") - expect(result).toContain("AND event_time < parseDateTimeBestEffort('") - expect(result.match(/WHERE/g)?.length).toBe(1) - }) - - test('numeric sort key uses direct comparison', () => { - const query = 'SELECT * FROM app.events WHERE status = 1' - const result = injectSortKeyFilter(query, 'id', 'numeric', '100', '200') - - expect(result).toContain("AND id >= '100'") - expect(result).toContain("AND id < '200'") - expect(result).not.toContain('parseDateTimeBestEffort') - }) - - test('handles query with WHERE and QUALIFY', () => { - const query = [ - 'SELECT *, skills', - 'FROM app.sessions AS s', - 'WHERE length(timestamps) > 0', - "QUALIFY ROW_NUMBER() OVER (PARTITION BY s.id ORDER BY s.ts DESC) = 1", - ].join('\n') - const result = injectSortKeyFilter(query, 'session_date', 'datetime', from, to) - - expect(result).toContain('WHERE length(timestamps) > 0') - expect(result).toContain("AND session_date >= parseDateTimeBestEffort('") - expect(result.indexOf('AND session_date')).toBeLessThan(result.indexOf('QUALIFY')) - }) - - test('handles MV query with WITH column expressions', () => { - const query = [ - 'WITH', - " arrayDistinct(arrayFilter(x -> x != '', extractAll(content, '\\\\w+'))) AS _skills", - 'SELECT', - ' id,', - ' _skills as skills,', - ' ts', - 'FROM app.sessions', - 'WHERE length(content) > 0', - ].join('\n') - const result = injectSortKeyFilter(query, 'ts', 'datetime', from, to) - - 
expect(result.match(/WHERE/g)?.length).toBe(1) - expect(result).toContain("AND ts >= parseDateTimeBestEffort('") - expect(result).toContain('arrayDistinct') - }) - - test('injects WHERE at end when query has no WHERE and no trailing clauses', () => { - const query = 'SELECT * FROM app.events' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain("WHERE event_time >= parseDateTimeBestEffort('") - expect(result).toContain("AND event_time < parseDateTimeBestEffort('") - }) - - test('ignores WHERE inside parenthesized subquery', () => { - const query = 'SELECT * FROM (SELECT * FROM app.events WHERE inner = 1) AS sub GROUP BY id' - const result = injectSortKeyFilter(query, 'ts', 'datetime', from, to) - - expect(result).toContain("WHERE ts >= parseDateTimeBestEffort('") - expect(result.indexOf("WHERE ts")).toBeLessThan(result.indexOf('GROUP BY')) - expect(result).toContain('WHERE inner = 1') - }) -}) - -describe('computeEnvironmentFingerprint', () => { - test('returns undefined when clickhouse is undefined', () => { - expect(computeEnvironmentFingerprint(undefined)).toBeUndefined() - }) - - test('returns correct structure with fingerprint, url origin, and database', () => { - const env = computeEnvironmentFingerprint({ - url: 'https://my-cluster.clickhouse.cloud:8443/some/path', - database: 'analytics', - }) - expect(env).toBeDefined() - expect(env?.fingerprint).toMatch(/^[a-f0-9]{16}$/) - expect(env?.url).toBe('https://my-cluster.clickhouse.cloud:8443') - expect(env?.database).toBe('analytics') - }) - - test('same URL+database produces same fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 'https://host:8443/path', database: 'db1' }) - const b = computeEnvironmentFingerprint({ url: 'https://host:8443/other', database: 'db1' }) - - expect(a?.fingerprint).toBe(b?.fingerprint) - }) - - test('different database produces different fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 
'https://host:8443', database: 'staging' }) - const b = computeEnvironmentFingerprint({ url: 'https://host:8443', database: 'production' }) - - expect(a?.fingerprint).not.toBe(b?.fingerprint) - }) - - test('different host produces different fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 'https://staging.ch.cloud:8443', database: 'db' }) - const b = computeEnvironmentFingerprint({ url: 'https://prod.ch.cloud:8443', database: 'db' }) - - expect(a?.fingerprint).not.toBe(b?.fingerprint) - }) -}) - -describe('environment binding in plan', () => { - test('plan includes environment when clickhouse is provided', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - - const output = await buildBackfillPlan({ - opts: PlanSchema.parse({ target: 'app.events' }), - configPath, - config, - clickhouse: { url: 'https://my-cluster.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - expect(output.plan.environment).toBeDefined() - expect(output.plan.environment?.fingerprint).toMatch(/^[a-f0-9]{16}$/) - expect(output.plan.environment?.url).toBe('https://my-cluster.ch.cloud:8443') - expect(output.plan.environment?.database).toBe('analytics') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('plan omits environment when clickhouse connection info is not provided', async () => { + test('omits idempotency token when disabled', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') @@ -546,50 +274,57 @@ describe('environment binding in plan', () => { schema: './schema.ts', metaDir: './chkit/meta', }) + const opts = PlanSchema.parse({ target: 'app.events', requireIdempotencyToken: false }) + const output = await buildBackfillPlan({ opts, 
configPath, config, clickhouseQuery: createMockQuery() }) - const output = await buildBackfillPlan({ - opts: PlanSchema.parse({ target: 'app.events' }), - configPath, - config, - clickhouseQuery: createMockQuery(), - }) - - expect(output.plan.environment).toBeUndefined() + const chunk = output.plan.chunkPlan.chunks[0] + const sql = chunk + ? buildChunkExecutionSql({ + planId: output.plan.planId, + chunk, + target: output.plan.target, + sourceTarget: output.plan.execution.sourceTarget, + table: output.plan.chunkPlan.table, + idempotencyToken: '', + }) + : '' + + expect(output.plan.execution.requireIdempotencyToken).toBe(false) + expect(sql).toContain('SETTINGS async_insert=0') + expect(sql).not.toContain('insert_deduplication_token') } finally { await rm(dir, { recursive: true, force: true }) } }) - test('plan includes environment from different clickhouse configs', async () => { + test('rejects persisted legacy plans with an actionable error', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') + const planId = 'deadbeefdeadbeef' try { const config = resolveConfig({ schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events' }) - - const staging = await buildBackfillPlan({ - opts, - configPath, - config, - clickhouse: { url: 'https://staging.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - const production = await buildBackfillPlan({ - opts, + const stateDir = computeBackfillStateDir(config, configPath) + const { planPath } = backfillPaths(stateDir, planId) + await mkdir(dirname(planPath), { recursive: true }) + + await writeFile(planPath, JSON.stringify({ + planId, + target: 'app.events', + createdAt: '2026-01-01T00:00:00.000Z', + from: '2026-01-01T00:00:00.000Z', + to: '2026-01-01T01:00:00.000Z', + chunks: [], + })) + + await expect(readPlan({ + planId, configPath, config, - clickhouse: { url: 
'https://prod.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - expect(staging.plan.environment?.url).toBe('https://staging.ch.cloud:8443') - expect(production.plan.environment?.url).toBe('https://prod.ch.cloud:8443') - expect(staging.plan.environment?.fingerprint).not.toBe(production.plan.environment?.fingerprint) + })).rejects.toThrow('uses a previous chunking format') } finally { await rm(dir, { recursive: true, force: true }) } diff --git a/packages/plugin-backfill/src/planner.ts b/packages/plugin-backfill/src/planner.ts index 6f24e02..055c1b2 100644 --- a/packages/plugin-backfill/src/planner.ts +++ b/packages/plugin-backfill/src/planner.ts @@ -1,10 +1,10 @@ import { dirname } from 'node:path' -import { loadSchemaDefinitions } from '@chkit/core/schema-loader' import type { ResolvedChxConfig } from '@chkit/core' +import { loadSchemaDefinitions } from '@chkit/core/schema-loader' -import { analyzeAndChunk } from './chunking/analyze.js' -import { buildChunkSql } from './chunking/sql.js' +import { encodeChunkPlanForPersistence } from './chunking/boundary-codec.js' +import { generateChunkPlan } from './chunking/planner.js' import { findMvForTarget } from './detect.js' import { BackfillConfigError } from './errors.js' import type { PlanOptions } from './options.js' @@ -12,20 +12,18 @@ import { backfillPaths, computeBackfillStateDir, computeEnvironmentFingerprint, + nowIso, writeJson, } from './state.js' -import type { - BackfillChunk, - BuildBackfillPlanOutput, - PartitionInfo, -} from './types.js' +import type { BuildBackfillPlanOutput } from './types.js' export async function buildBackfillPlan(input: { opts: PlanOptions configPath: string config: Pick clickhouse?: { url: string; database: string } - clickhouseQuery: (sql: string) => Promise + clickhouseQuery: (sql: string, settings?: Record) => Promise + querySettings?: Record }): Promise { const { opts } = input const [database, table] = opts.target.split('.') @@ -33,33 +31,36 
@@ export async function buildBackfillPlan(input: { throw new BackfillConfigError('Invalid target format. Expected .') } - const env = computeEnvironmentFingerprint(input.clickhouse) - - // 1. Analyze table and build planned chunks - const { planId, partitions, sortKey, chunks: plannedChunks } = await analyzeAndChunk({ + const chunkPlan = await generateChunkPlan({ database, table, from: opts.from, to: opts.to, - maxChunkBytes: opts.maxChunkBytes, - requireIdempotencyToken: opts.requireIdempotencyToken, + targetChunkBytes: opts.maxChunkBytes, query: input.clickhouseQuery, + querySettings: input.querySettings, }) - if (partitions.length === 0) { + const firstPartition = chunkPlan.partitions[0] + if (!firstPartition) { throw new BackfillConfigError( `No partitions found for ${opts.target}${opts.from || opts.to ? ' within the specified time range' : ''}. The table may be empty.` ) } - const firstPartition = partitions[0] as PartitionInfo - const derivedFrom = opts.from ?? partitions.reduce((min, p) => (p.minTime < min ? p.minTime : min), firstPartition.minTime) - const derivedTo = opts.to ?? partitions.reduce((max, p) => (p.maxTime > max ? p.maxTime : max), firstPartition.maxTime) + const env = computeEnvironmentFingerprint(input.clickhouse) + const derivedFrom = opts.from ?? chunkPlan.partitions.reduce( + (min, partition) => (partition.minTime < min ? partition.minTime : min), + firstPartition.minTime + ) + const derivedTo = opts.to ?? chunkPlan.partitions.reduce( + (max, partition) => (partition.maxTime > max ? partition.maxTime : max), + firstPartition.maxTime + ) const stateDir = computeBackfillStateDir(input.config, input.configPath, opts.stateDir) - const paths = backfillPaths(stateDir, planId) + const paths = backfillPaths(stateDir, chunkPlan.planId) - // 2. 
Detect MV for replay strategy let mvAsQuery: string | undefined let targetColumns: string[] | undefined @@ -71,62 +72,37 @@ export async function buildBackfillPlan(input: { if (mv) { mvAsQuery = mv.as const tableDef = definitions.find( - (d) => d.kind === 'table' && d.database === database && d.name === table + (definition) => definition.kind === 'table' && definition.database === database && definition.name === table ) - if (tableDef && tableDef.kind === 'table') { - targetColumns = tableDef.columns.map((c) => c.name) + if (tableDef?.kind === 'table') { + targetColumns = tableDef.columns.map((column) => column.name) } } } catch { - // Schema load failed — fall back to direct copy + // Schema load failed, fall back to direct copy. } - // 3. Stamp SQL on each planned chunk to produce BackfillChunk[] - const chunks: BackfillChunk[] = plannedChunks.map(planned => { - const sqlTemplate = buildChunkSql({ - planId, - chunk: planned, - target: opts.target, - sortKey, - mvAsQuery, - targetColumns, - }) - - return { - id: planned.id, - from: planned.from, - to: planned.to, - status: 'pending' as const, - attempts: 0, - idempotencyToken: planned.idempotencyToken, - sqlTemplate, - partitionId: planned.partitionId, - estimatedBytes: planned.estimatedBytes, - ...(planned.sortKeyFrom !== undefined ? { sortKeyFrom: planned.sortKeyFrom } : {}), - ...(planned.sortKeyTo !== undefined ? { sortKeyTo: planned.sortKeyTo } : {}), - } - }) - - const strategy = mvAsQuery ? 'mv_replay' : 'partition' - const plan = { - planId, + planId: chunkPlan.planId, target: opts.target, - createdAt: '1970-01-01T00:00:00.000Z', - status: 'planned' as const, - strategy: strategy as 'partition' | 'mv_replay', + createdAt: nowIso(), ...(env ? { environment: env } : {}), from: derivedFrom, to: derivedTo, - chunks, - partitions, - sortKey, + chunkPlan, + execution: { + mode: mvAsQuery ? 'mv_replay' as const : 'copy' as const, + sourceTarget: opts.target, + ...(mvAsQuery ? 
{ mvAsQuery } : {}), + ...(targetColumns ? { targetColumns } : {}), + requireIdempotencyToken: opts.requireIdempotencyToken, + }, options: { maxChunkBytes: opts.maxChunkBytes, maxParallelChunks: opts.maxParallelChunks, maxRetriesPerChunk: opts.maxRetriesPerChunk, requireIdempotencyToken: opts.requireIdempotencyToken, - sortKeyColumn: sortKey?.column, + sortKeyColumn: chunkPlan.table.sortKeys[0]?.name, }, policy: { requireDryRunBeforeRun: opts.requireDryRunBeforeRun, @@ -140,7 +116,10 @@ export async function buildBackfillPlan(input: { }, } - await writeJson(paths.planPath, plan) + await writeJson(paths.planPath, { + ...plan, + chunkPlan: encodeChunkPlanForPersistence(plan.chunkPlan), + }) return { plan, diff --git a/packages/plugin-backfill/src/plugin.test.ts b/packages/plugin-backfill/src/plugin.test.ts index 275fe78..b84c98b 100644 --- a/packages/plugin-backfill/src/plugin.test.ts +++ b/packages/plugin-backfill/src/plugin.test.ts @@ -1,7 +1,16 @@ import { describe, expect, test } from 'bun:test' +import { readFileSync } from 'node:fs' +import * as sdk from './sdk.js' +import * as root from './index.js' import { backfill, createBackfillPlugin } from './plugin.js' +const pluginBackfillPackage = JSON.parse( + readFileSync(new URL('../package.json', import.meta.url), 'utf8') +) as { + exports: Record +} + describe('@chkit/plugin-backfill plugin surface', () => { test('exposes commands and typed registration helper', () => { const plugin = createBackfillPlugin() @@ -21,4 +30,34 @@ describe('@chkit/plugin-backfill plugin surface', () => { expect(registration.enabled).toBe(true) expect(registration.options?.maxParallelChunks).toBe(4) }) + + test('keeps internals off the package root and exposes them via sdk', () => { + expect(root).not.toHaveProperty('analyzeAndChunk') + expect(root).not.toHaveProperty('executeBackfill') + + expect(sdk).toHaveProperty('analyzeAndChunk') + expect(sdk).toHaveProperty('configureSync') + expect(sdk).toHaveProperty('generateChunkPlan') + 
expect(sdk).toHaveProperty('getBackfillLogger') + expect(sdk).toHaveProperty('getConsoleSink') + expect(sdk).toHaveProperty('executeBackfill') + expect(sdk).toHaveProperty('buildChunkExecutionSql') + expect(sdk).toHaveProperty('buildWhereClauseFromChunk') + expect(sdk).toHaveProperty('encodeChunkPlanForPersistence') + expect(sdk).toHaveProperty('decodeChunkPlanFromPersistence') + expect(sdk).toHaveProperty('generateIdempotencyToken') + }) + + test('package exports declare root and sdk subpath separately', () => { + expect(pluginBackfillPackage.exports['.']).toEqual({ + source: './src/index.ts', + types: './dist/index.d.ts', + default: './dist/index.js', + }) + expect(pluginBackfillPackage.exports['./sdk']).toEqual({ + source: './src/sdk.ts', + types: './dist/sdk.d.ts', + default: './dist/sdk.js', + }) + }) }) diff --git a/packages/plugin-backfill/src/plugin.ts b/packages/plugin-backfill/src/plugin.ts index 53079d8..449c58c 100644 --- a/packages/plugin-backfill/src/plugin.ts +++ b/packages/plugin-backfill/src/plugin.ts @@ -2,6 +2,8 @@ import { createClickHouseExecutor } from '@chkit/clickhouse' import { wrapPluginRun } from '@chkit/core' import { executeBackfill, type BackfillProgress } from './async-backfill.js' +import { buildChunkExecutionSql } from './chunking/sql.js' +import { generateIdempotencyToken } from './chunking/utils/ids.js' import { BackfillConfigError } from './errors.js' import { PLAN_FLAGS, @@ -112,11 +114,22 @@ async function runBackfill(input: { const result = await executeBackfill({ executor: db, planId: plan.planId, - chunks: plan.chunks.map((c) => ({ id: c.id, from: c.from, to: c.to })), + chunks: plan.chunkPlan.chunks.map((chunk) => ({ id: chunk.id })), buildQuery: (chunk) => { - const planChunk = plan.chunks.find((c) => c.id === chunk.id) + const planChunk = plan.chunkPlan.chunks.find((candidate) => candidate.id === chunk.id) if (!planChunk) throw new Error(`Chunk ${chunk.id} not found in plan`) - return planChunk.sqlTemplate + return 
buildChunkExecutionSql({ + planId: plan.planId, + chunk: planChunk, + target: plan.target, + sourceTarget: plan.execution.sourceTarget, + table: plan.chunkPlan.table, + mvAsQuery: plan.execution.mvAsQuery, + targetColumns: plan.execution.targetColumns, + idempotencyToken: plan.execution.requireIdempotencyToken + ? generateIdempotencyToken(plan.planId, planChunk.id) + : '', + }) }, concurrency: input.concurrency, pollIntervalMs: input.pollIntervalMs, @@ -205,22 +218,25 @@ export function createBackfillPlugin(options: PluginConfig = {}): BackfillPlugin configPath: context.configPath, config: context.config, clickhouse: context.config.clickhouse, - clickhouseQuery: async (sql: string) => { - const result = await db.query(sql) + clickhouseQuery: async (sql: string, settings?: Record) => { + const result = await db.query(sql, settings) return result as T[] }, + // ObsessionDB (ClickHouse Cloud) enables parallel replicas by default, + // which inflates aggregate results (count, GROUP BY). Disable for planning + // queries until ObsessionDB handles it at the profile level. + querySettings: { enable_parallel_replicas: 0 }, }) const payload = planPayload(output) if (context.jsonMode) { context.print(payload) } else { - const partitionCount = output.plan.partitions?.length ?? 0 - const totalBytes = output.plan.partitions - ? formatBytes(output.plan.partitions.reduce((sum, p) => sum + p.bytesOnDisk, 0)) - : 'unknown' - const sortKeyLabel = output.plan.sortKey - ? `, sort key: ${output.plan.sortKey.column} (${output.plan.sortKey.category})` + const partitionCount = output.plan.chunkPlan.partitions.length + const totalBytes = formatBytes(output.plan.chunkPlan.totalBytesCompressed) + const primarySortKey = output.plan.chunkPlan.table.sortKeys[0] + const sortKeyLabel = primarySortKey + ? 
`, sort key: ${primarySortKey.name} (${primarySortKey.category})` : '' context.print( `Backfill plan ${payload.planId} for ${payload.target} (${payload.chunkCount} chunks across ${partitionCount} partitions, ~${totalBytes}${sortKeyLabel}) -> ${payload.planPath}` diff --git a/packages/plugin-backfill/src/queries.ts b/packages/plugin-backfill/src/queries.ts index 66780ef..ffdef75 100644 --- a/packages/plugin-backfill/src/queries.ts +++ b/packages/plugin-backfill/src/queries.ts @@ -35,8 +35,8 @@ export async function getBackfillStatus(input: { target: plan.target, status: 'planned', totals: { - total: plan.chunks.length, - pending: plan.chunks.length, + total: plan.chunkPlan.chunks.length, + pending: plan.chunkPlan.chunks.length, submitted: 0, running: 0, done: 0, @@ -108,7 +108,7 @@ export async function getBackfillDoctorReport(input: { planId: plan.planId, target: plan.target, status: 'planned' as const, - totals: { total: plan.chunks.length, pending: plan.chunks.length, submitted: 0, running: 0, done: 0, failed: 0 }, + totals: { total: plan.chunkPlan.chunks.length, pending: plan.chunkPlan.chunks.length, submitted: 0, running: 0, done: 0, failed: 0 }, rowsWritten: 0, updatedAt: plan.createdAt, runPath: paths.runPath, diff --git a/packages/plugin-backfill/src/sdk.ts b/packages/plugin-backfill/src/sdk.ts new file mode 100644 index 0000000..99dcbf8 --- /dev/null +++ b/packages/plugin-backfill/src/sdk.ts @@ -0,0 +1,48 @@ +export * from '@logtape/logtape' + +export { executeBackfill, syncProgress } from './async-backfill.js' +export { analyzeAndChunk, analyzeTable } from './chunking/analyze.js' +export { + decodeChunkPlanFromPersistence, + encodeChunkPlanForPersistence, +} from './chunking/boundary-codec.js' +export { generateChunkPlan } from './chunking/planner.js' +export { + CHKIT_BACKFILL_LOGGER_CATEGORY, + CHKIT_LOGGER_CATEGORY, + getBackfillLogger, +} from './logging.js' +export { + buildChunkExecutionSql, + buildWhereClauseFromChunk, + injectSortKeyFilter, + 
rewriteSelectColumns, +} from './chunking/sql.js' +export { generateIdempotencyToken } from './chunking/utils/ids.js' + +export type { + BackfillOptions, + BackfillChunkState, + BackfillProgress, + BackfillResult, +} from './async-backfill.js' + +export type { + AnalyzeAndChunkInput, + AnalyzeAndChunkResult, + AnalyzeTableInput, + AnalyzeTableResult, +} from './chunking/analyze.js' + +export type { + Chunk, + ChunkDerivationStep, + ChunkPlan, + ChunkRange, + EstimateConfidence, + EstimateReason, + FocusedValue, + Partition, + PartitionDiagnostics, + SortKey, +} from './chunking/types.js' diff --git a/packages/plugin-backfill/src/state.ts b/packages/plugin-backfill/src/state.ts index 45dd900..2b47d52 100644 --- a/packages/plugin-backfill/src/state.ts +++ b/packages/plugin-backfill/src/state.ts @@ -5,6 +5,7 @@ import { dirname, join, resolve } from 'node:path' import type { ResolvedChxConfig } from '@chkit/core' +import { decodeChunkPlanFromPersistence } from './chunking/boundary-codec.js' import { BackfillConfigError } from './errors.js' import type { BackfillEnvironment, @@ -89,6 +90,13 @@ async function readJsonMaybe(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } +function decodePlan(plan: BackfillPlanState): BackfillPlanState { + return { + ...plan, + chunkPlan: decodeChunkPlanFromPersistence(plan.chunkPlan), + } +} + export async function writeJson(filePath: string, value: unknown): Promise { await mkdir(dirname(filePath), { recursive: true }) await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') @@ -102,12 +110,21 @@ export async function readPlan(input: { }): Promise { const stateDir = computeBackfillStateDir(input.config, input.configPath, input.stateDir) const paths = backfillPaths(stateDir, input.planId) - const plan = await readJsonMaybe(paths.planPath) - if (!plan) { + const rawPlan = await readJsonMaybe>(paths.planPath) + if (!rawPlan) { throw new BackfillConfigError(`Backfill plan not found: 
${paths.planPath}`) } + + if (!('chunkPlan' in rawPlan)) { + throw new BackfillConfigError( + `Backfill plan ${input.planId} uses a previous chunking format and can no longer be loaded. Recreate the plan.` + ) + } + + const plan = rawPlan as unknown as BackfillPlanState + return { - plan, + plan: decodePlan(plan), planPath: paths.planPath, stateDir, } @@ -132,7 +149,7 @@ export function summarizeRunStatus( plan: BackfillPlanState, ): BackfillStatusSummary { const totals = { - total: plan.chunks.length, + total: plan.chunkPlan.chunks.length, pending: 0, submitted: 0, running: 0, @@ -141,7 +158,7 @@ export function summarizeRunStatus( } let rowsWritten = 0 - for (const chunk of plan.chunks) { + for (const chunk of plan.chunkPlan.chunks) { const state = run.progress[chunk.id] if (!state) { totals.pending += 1 diff --git a/packages/plugin-backfill/src/types.ts b/packages/plugin-backfill/src/types.ts index f3b50da..4f02865 100644 --- a/packages/plugin-backfill/src/types.ts +++ b/packages/plugin-backfill/src/types.ts @@ -1,7 +1,9 @@ import type { ChxInlinePluginRegistration, ResolvedChxConfig } from '@chkit/core' import type { BackfillProgress } from './async-backfill.js' -import type { PartitionInfo, SortKeyInfo } from './chunking/types.js' +import type { + ChunkPlan, +} from './chunking/types.js' import type { PluginConfig } from './options.js' /** @deprecated Use {@link PluginConfig} instead. 
*/ @@ -16,42 +18,41 @@ export interface BackfillEnvironment { export type BackfillPlanStatus = 'planned' | 'running' | 'paused' | 'completed' | 'failed' | 'cancelled' -export type { ChunkBoundary, PartitionInfo, PlannedChunk, SortKeyInfo } from './chunking/types.js' - -export interface BackfillChunk { - id: string - from: string - to: string - status: 'pending' | 'running' | 'done' | 'failed' | 'skipped' - attempts: number - idempotencyToken: string - sqlTemplate: string - lastError?: string - partitionId: string - estimatedBytes: number - sortKeyFrom?: string - sortKeyTo?: string +export type { + Chunk, + ChunkDerivationStep, + ChunkPlan, + ChunkRange, + EstimateConfidence, + EstimateReason, + FocusedValue, + Partition, + PartitionDiagnostics, + SortKey, +} from './chunking/types.js' + +export interface BackfillExecutionPlan { + mode: 'copy' | 'mv_replay' + sourceTarget: string + mvAsQuery?: string + targetColumns?: string[] + requireIdempotencyToken: boolean } export interface BackfillPlanState { planId: string target: string createdAt: string - status: BackfillPlanStatus - strategy?: 'table' | 'mv_replay' | 'partition' environment?: BackfillEnvironment from: string to: string - chunks: BackfillChunk[] - partitions?: PartitionInfo[] - sortKey?: SortKeyInfo + chunkPlan: ChunkPlan + execution: BackfillExecutionPlan options: { - chunkHours?: number maxChunkBytes?: number maxParallelChunks: number maxRetriesPerChunk: number requireIdempotencyToken: boolean - timeColumn?: string sortKeyColumn?: string } policy: { diff --git a/packages/plugin-obsessiondb/package.json b/packages/plugin-obsessiondb/package.json index 3e4eee5..01b459a 100644 --- a/packages/plugin-obsessiondb/package.json +++ b/packages/plugin-obsessiondb/package.json @@ -41,6 +41,7 @@ "clean": "rm -rf dist" }, "dependencies": { + "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", "@orpc/client": "1.13.4", "@orpc/contract": "1.13.4", diff --git a/packages/plugin-obsessiondb/src/index.ts 
b/packages/plugin-obsessiondb/src/index.ts index 77efd28..d17c6bd 100644 --- a/packages/plugin-obsessiondb/src/index.ts +++ b/packages/plugin-obsessiondb/src/index.ts @@ -12,6 +12,10 @@ import { loadSelectedService } from './service/storage.js' export { loadCredentials, resolveBaseUrl, type Credentials } from './auth/index.js' export { createJobsClient, type JobsClient } from './backfill/index.js' +export { + loadSelectedService, +} from './service/storage.js' +export type { SelectedService } from './service/types.js' export type ObsessionDBPluginOptions = Record diff --git a/thoughts/smart-chunking-e2e-scenarios.md b/thoughts/smart-chunking-e2e-scenarios.md new file mode 100644 index 0000000..d165593 --- /dev/null +++ b/thoughts/smart-chunking-e2e-scenarios.md @@ -0,0 +1,194 @@ +# Smart Chunking E2E Test Scenarios + +Remaining scenarios to implement. Each gets its own table in the seed script and a `describe` block in `smart-chunking.e2e.test.ts`. + +Implemented so far: +- [x] Scenario 1: Skewed Power Law (80/20 single hot key) +- [x] Scenario 2: Multiple Hot Keys (3 tenants at ~30% each) + +--- + +## Scenario 3: Empty Ranges / Sparse Numeric Sort Key + +**Table:** `chkit_e2e_chunking_sparse_numeric` +**Sort key:** `(id UInt64)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- ~5,000 rows with `id` in range `[1, 10]` +- ~5,000 rows with `id` in range `[1_000_000, 1_000_010]` +- No values between 10 and 1,000,000 +- Padding column for byte control + +**What this tests:** +- Equal-width splitting will carve the huge numeric gap into many empty intervals +- Quantile binary search must handle the gap without producing empty chunks +- The system should not emit chunks with 0 rows +- After merge, only chunks covering the two clusters should remain +- Full row coverage despite the sparse distribution + +**Key assertions:** +- No chunk has 0 estimated rows +- All chunks produced have `estimate.rows > 0` +- Total counted rows = total actual rows +- Chunk count 
is reasonable (not dozens of empty chunks) + +--- + +## Scenario 4: Single Distinct Value in Sort Key + +**Table:** `chkit_e2e_chunking_single_value` +**Sort key:** `(status String, seq UInt64)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- 10,000 rows all with `status = 'active'`, `seq` 0-9999 +- Single partition, padding for byte volume + +**What this tests:** +- Every splitting strategy on dimension 0 should fail (quantile boundaries collapse, equal-width produces identical bounds, group-by-key returns 1 value) +- The system must fall through to dimension 1 (seq) and split there +- Or: produce a single chunk if seq splitting isn't needed +- Must not infinite-loop or error when no split is possible on dim 0 + +**Key assertions:** +- Plan completes without error +- If partition is oversized: chunks are split on dim 1 (seq), not dim 0 +- Total counted rows = total actual rows +- No duplicate coverage + +--- + +## Scenario 5: Very Long String Keys with Shared Prefixes + +**Table:** `chkit_e2e_chunking_long_prefix` +**Sort key:** `(url String)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- 10,000 rows where `url` follows pattern: `https://example.com/api/v2/resources/XXXX` + where `XXXX` is a 4-digit incrementing ID (0000-9999) +- All values share a 39-character prefix; differ only in the last 4 characters +- Single partition + +**What this tests:** +- `string-prefix-split` at depths 1-4 will see a single bucket (prefix is 39 chars) +- The system must fall through to quantile or equal-width splitting +- The dynamic BigInt width (from our fix) must handle 40+ char strings correctly +- Boundary computation must have enough precision in the suffix to split evenly + +**Key assertions:** +- Plan completes, produces multiple chunks +- Chunks have boundaries that differentiate in the suffix portion +- Full row coverage +- No chunks with 0 rows (the long shared prefix shouldn't confuse the splitter) + +--- + +## Scenario 6: DateTime Sort Key 
with Burst Traffic + +**Table:** `chkit_e2e_chunking_datetime_burst` +**Sort key:** `(event_time DateTime)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- 500 rows spread across 30 days of January 2026 (background traffic) +- 9,500 rows all within a single hour: `2026-01-15 14:00:00` to `2026-01-15 14:59:59` +- Single partition, padding for byte volume + +**What this tests:** +- Day-level temporal bucketing produces one massive day and many tiny ones +- Hour-level fallback kicks in for Jan 15 +- If 95% is within one hour, even hour-level bucketing can't split further +- Must fall through to quantile splitting on the datetime dimension itself +- Tests the full temporal cascade: day -> hour -> quantile + +**Key assertions:** +- Plan completes, produces multiple chunks +- The burst hour is split into multiple chunks (not left as one oversized chunk) +- Background traffic days are merged into larger chunks (not 30 tiny chunks) +- Full row coverage +- Reasonable chunk sizes (within 2-3x target) + +--- + +## Scenario 7: Three-Dimension Compound Key + +**Table:** `chkit_e2e_chunking_three_dim` +**Sort key:** `(region String, tenant_id String, event_time DateTime)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- 5 regions: `us-east`, `us-west`, `eu-west`, `ap-south`, `ap-east` +- Per region: 1 hot tenant with 1,500 rows + 10 small tenants with 10 rows each +- Hot tenant rows spread across 7 days in January 2026 +- Total: 5 * (1500 + 100) = 8,000 rows + +**What this tests:** +- Recursion through 3 dimensions (max depth = 3 * 3 = 9) +- Dimension 0 (region) splits into ~5 sub-ranges +- Dimension 1 (tenant_id) identifies hot tenant per region +- Dimension 2 (event_time) splits hot tenants by time +- Final chunks should carry ranges on all three dimensions + +**Key assertions:** +- Plan completes within timeout +- Hot tenants are detected as focused values +- Some chunks have ranges on all 3 dimensions +- Full row coverage +- Chunk count is reasonable 
(not exponential blowup) + +--- + +## Scenario 8: Partition at Exact Fuzz Factor Boundary + +**Table:** `chkit_e2e_chunking_fuzz_boundary` +**Sort key:** `(id UInt64)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- Two partitions (January and February 2026) +- January: rows sized to be exactly at `targetChunkBytes * 1.0` +- February: rows sized to be exactly at `targetChunkBytes * 1.6` (above the 1.5x fuzz factor) +- Controlled via row count and padding size + +**What this tests:** +- The stop condition `<= target * 1.5` +- January partition (1.0x) should produce exactly 1 chunk +- February partition (1.6x) should be split into 2+ chunks +- Boundary arithmetic of the fuzz factor + +**Key assertions:** +- January partition: exactly 1 chunk +- February partition: 2+ chunks +- Full row coverage in both partitions + +**Implementation note:** This requires querying `system.parts` after seeding to learn the actual uncompressed bytes, then computing the target from the smaller partition's size. The seed might need iterative adjustment to hit the right byte ratio. 
+ +--- + +## Scenario 9: Mixed Type Sort Keys (Numeric + String) + +**Table:** `chkit_e2e_chunking_mixed_types` +**Sort key:** `(priority UInt8, slug String)` +**Partition by:** `toYYYYMM(event_time)` + +**Dataset:** +- `priority` has 3 distinct values: 1, 2, 3 +- Priority 1: 1,000 rows with 100 distinct slugs +- Priority 2: 6,000 rows with 50 distinct slugs (hot priority) +- Priority 3: 3,000 rows with 200 distinct slugs +- Slugs are short strings like `item-XXXX` + +**What this tests:** +- Numeric dimension with very low cardinality (3 values) +- Quantile splitting will likely collapse on dim 0 (only 3 values) +- Equal-width on dim 0 should produce 3 intervals matching the 3 values +- Oversized priority-2 bucket must then split on dim 1 (slug) +- Tests cross-type dimension interaction + +**Key assertions:** +- All three priorities are represented in chunks +- Priority 2 chunks are split on the slug dimension +- Full row coverage +- No chunks span multiple priority values (each chunk's dim 0 range should be tight)