From 4ef37bdf0bdfae204ac529f323c3f29f94a78772 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 6 May 2026 12:50:50 -0700 Subject: [PATCH 1/4] Default plugins.calcite.enabled=true on the unified query path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unified query handler (`RestUnifiedQueryAction` → `TransportPPLQueryAction` → analytics-engine) builds its `UnifiedQueryContext` with an empty `Settings` map; nothing wires the cluster setting through. PPL parsing is delegated to the same `AstBuilder` the v2 path uses, and that builder gates `table` (and other Calcite-only commands) on `Settings.Key.CALCITE_ENGINE_ENABLED`. With no setting propagated, the gate sees `null`, fails the `Boolean.TRUE.equals` check, and throws UnsupportedOperationException: Table command is supported only when plugins.calcite.enabled=true even when the cluster setting is true, blocking every `table` query routed through the analytics path under `tests.analytics.force_routing=true`. The unified path is by definition Calcite-based — every query reaching `UnifiedQueryContext` flows through Calcite's planner. Default `CALCITE_ENGINE_ENABLED=true` in `buildSettings()` when the underlying map doesn't have the key. This unblocks `table` and any other AstBuilder gate that defends the same toggle, without changing v2 behavior (v2 constructs `AstBuilder` with cluster `Settings`, not the unified context). Also refresh the PPL analytics-engine routing dev doc to document the unified-context dependency and switch the per-command verification recipe from the bundle-branch `analyticsCompatibilityTest` task to the standard `integTestRemote -Dtests.analytics.{force_routing,parquet_indices}=true` — the bundle-branch task is purely for the daily coverage-report sweep. 
Signed-off-by: Kai Huang --- .../sql/api/UnifiedQueryContext.java | 8 + docs/dev/ppl-analytics-engine-coverage-sop.md | 152 +++++++ docs/dev/ppl-analytics-engine-routing.md | 426 ++++++++++++++++++ 3 files changed, 586 insertions(+) create mode 100644 docs/dev/ppl-analytics-engine-coverage-sop.md create mode 100644 docs/dev/ppl-analytics-engine-routing.md diff --git a/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java b/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java index 4332ff17660..864b2610897 100644 --- a/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java +++ b/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java @@ -225,6 +225,14 @@ private Settings buildSettings() { @Override @SuppressWarnings("unchecked") public T getSettingValue(Key key) { + // The unified query path is always Calcite-based — every query reaching this context + // goes through Calcite's planner, never the v2 engine. Default + // {@code plugins.calcite.enabled} to true so AstBuilder gates (e.g. visitTableCommand) + // that protect v2-only code paths don't block valid analytics-engine queries when the + // underlying setting hasn't been propagated through Builder#setting. + if (key == Key.CALCITE_ENGINE_ENABLED && !settings.containsKey(key)) { + return (T) Boolean.TRUE; + } return (T) settings.get(key); } diff --git a/docs/dev/ppl-analytics-engine-coverage-sop.md b/docs/dev/ppl-analytics-engine-coverage-sop.md new file mode 100644 index 00000000000..710feb5f724 --- /dev/null +++ b/docs/dev/ppl-analytics-engine-coverage-sop.md @@ -0,0 +1,152 @@ +# SOP — PPL Analytics-Engine IT Coverage Report + +Run all PPL ITs through the analytics-engine path against an externally-managed +cluster, get a bucketed markdown report. Daily. + +Output: `integ-test/build/reports/analytics-compatibility/REPORT.md` + +--- + +## One-time setup + +JDK 25 + JDK 21, Rust stable, ~15 GB free disk. 
+ +```bash +mkdir -p ~/workspace && cd ~/workspace +git clone git@github.com:opensearch-project/OpenSearch.git +git clone git@github.com:opensearch-project/sql.git +( cd OpenSearch && git remote add upstream https://github.com/opensearch-project/OpenSearch.git ) +( cd sql && git remote add ahkcs git@github.com:ahkcs/sql.git ) +``` + +**OpenSearch — track `upstream/main` + apply commons-text patch:** + +`upstream/main` already has the full analytics-engine sandbox plugin set +(`analytics-engine`, `analytics-backend-{datafusion,lucene}`, `composite-engine`, +`dsl-query-executor`, `parquet-data-format`) and the `dataformat-native` rust +crate. Track main; rebase daily. + +```bash +cd ~/workspace/OpenSearch +git fetch upstream main && git checkout -B main upstream/main +``` + +In `sandbox/plugins/analytics-engine/build.gradle`, add next to the existing +`commons-math3` runtimeOnly: + +```groovy +runtimeOnly "org.apache.commons:commons-text:1.11.0" +``` + +`commons-text` is only listed under `resolutionStrategy.force` on main, which +pins the version but doesn't bundle the jar — without this `runtimeOnly` line +the cluster dies on `LevenshteinDistance` the first time a query reaches +Calcite's `SqlFunctions.`. Track upstreaming; once it lands, drop this +patch. + +**Build the native lib (one-time + after rust changes):** + +The cluster JVM loads `libopensearch_native.dylib` (rust + JNI/FFM) at startup +for DataFusion execution, parquet I/O, and native repository ops. Without it +the cluster crashes in `NativeBridge.`. + +```bash +cd ~/workspace/OpenSearch/sandbox/libs/dataformat-native/rust +cargo build --release -p opensearch-native-lib # ~5 min +``` + +--- + +## Daily run (~30 min) + +### 1. 
Get the SQL bundle worktree + +First time: +```bash +cd ~/workspace/sql +git fetch ahkcs feature/ppl-coverage-bundle +DATE=$(date +%Y-%m-%d) +git worktree add "../sql-ppl-coverage-${DATE}" ahkcs/feature/ppl-coverage-bundle +``` + +Reuse same day: +```bash +cd ~/workspace/sql-ppl-coverage-${DATE} +git fetch ahkcs && git rebase ahkcs/feature/ppl-coverage-bundle +``` + +### 2. Publish the SQL plugin to mavenLocal + +`:run` installs `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` from maven coords — +mavenLocal first, then OpenSearch CI snapshots. The SQL plugin isn't on CI +snapshots, so without this step the cluster fails to bootstrap with +`Could not resolve …`. Re-publish whenever the bundle worktree is updated. + +```bash +cd ~/workspace/sql-ppl-coverage-${DATE} +./gradlew publishToMavenLocal # ~1 min, JDK 21+ +``` + +### 3. Bring up cluster (terminal A — leave running) + +```bash +cd ~/workspace/OpenSearch +JAVA_HOME=$(mise where java@temurin-25.0.1+8.0.LTS) ./gradlew :run \ + -Dsandbox.enabled=true \ + -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ +'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ +'analytics-backend-lucene', 'composite-engine', \ +'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" +``` + +Wait for it: +```bash +until curl -sf http://localhost:9200/ >/dev/null; do sleep 4; done +``` + +### 4. Run the report (terminal B) + +```bash +cd ~/workspace/sql-ppl-coverage-${DATE} +rm -rf integ-test/build/test-results/analyticsCompatibility \ + integ-test/build/test-results/analyticsCompatibilityTest \ + integ-test/build/reports/analytics-compatibility + +./gradlew :integ-test:analyticsCompatibilityReport \ + -Dtests.rest.cluster=localhost:9200 \ + -Dtests.cluster=localhost:9300 \ + -Dtests.clustername=runTask +``` + +`BUILD SUCCESSFUL` is expected — failures are the report payload, not a build +break. + +### 5. Open the report + +```bash +open integ-test/build/reports/analytics-compatibility/REPORT.md +``` + +### 6. 
Tear down + +`Ctrl-C` terminal A. + +--- + +## When something breaks + +| Symptom | Fix | +|---|---| +| `BUILD FAILED … java.io.EOFException … KryoBackedDecoder` | `rm -rf integ-test/build/test-results/analyticsCompatibilityTest` and rerun. | +| Run finishes <10 min, top bucket is `ConnectException` | Cluster died. Tail `~/workspace/OpenSearch/build/testclusters/runTask-0/logs/runTask.log` for `fatal error`; the missing class usually needs a `runtimeOnly` dep added to `analytics-engine/build.gradle`. | +| Top bucket is `No backend can scan all requested fields` (thousands) | Parquet flag isn't reaching `TestUtils`. Confirm worktree is on `ahkcs/feature/ppl-coverage-bundle`. | +| `BUILD FAILED` immediately, log shows `[BUILD] Starting OpenSearch process / Stopping node` | `task.getClusters().clear()` is missing. Re-fetch `feature/ppl-coverage-bundle`. | + +--- + +## What the bundle branch is + +`ahkcs/feature/ppl-coverage-bundle` rebases on +`upstream/feature/mustang-ppl-integration` and adds the report task, parquet +flag, origin classifier, pass-rate fix, and testcluster auto-bind workaround. +Daily upstream sync: `git rebase upstream/feature/mustang-ppl-integration`. diff --git a/docs/dev/ppl-analytics-engine-routing.md b/docs/dev/ppl-analytics-engine-routing.md new file mode 100644 index 00000000000..e2dcf40b686 --- /dev/null +++ b/docs/dev/ppl-analytics-engine-routing.md @@ -0,0 +1,426 @@ +# Routing PPL Commands Through the Analytics-Engine Path + +A guide for wiring an existing PPL command — already working on the v2 / Calcite engine — through the analytics-engine route so that `CalciteXxxCommandIT` passes against the force-routed analytics-engine path, without needing a per-command code change in the SQL plugin. + +This doc is the playbook distilled from landing PPL `fillnull` end-to-end through the analytics engine ([opensearch-project/OpenSearch#21472][pr-fillnull]). 
Pair with [`ppl-commands.md`](ppl-commands.md) (which covers adding a *new* PPL command on the v2 / Calcite engine). + +[pr-fillnull]: https://github.com/opensearch-project/OpenSearch/pull/21472 + +## Goal + +For an existing PPL command `Xxx` whose `CalciteXxxCommandIT` currently passes on the v2 / Calcite engine but fails on the analytics-engine route, drive the analytics-engine route to parity. Two acceptance criteria: + +1. `CalciteXxxCommandIT` (in `opensearch-project/sql`) passes when run via `:integ-test:integTestRemote` with `-Dtests.analytics.force_routing=true -Dtests.analytics.parquet_indices=true`. +2. A self-contained `XxxCommandIT` lives in `opensearch-project/OpenSearch` under `sandbox/qa/analytics-engine-rest`, exercising the command through `POST /_analytics/ppl` against a parquet-backed dataset. CI for OpenSearch core can verify the analytics-engine path without checking out the SQL plugin. + +The QA IT in OpenSearch core is the long-term verification surface — once it lands, the SQL-plugin IT becomes a duplicate maintained for the v2 / Calcite path. 
+ +## Mental model — three layers + +A PPL query on the analytics-engine route passes through three independent layers: + +``` +PPL text + │ + ▼ +[ Layer 1 ] SQL plugin lowering (opensearch-project/sql) + │ PPLSyntaxParser → AstBuilder → CalciteRelNodeVisitor → RelNode tree + │ + ▼ +[ Layer 2 ] Analytics-engine planner (opensearch-project/OpenSearch · sandbox/plugins/analytics-engine) + │ Capability registry, OpenSearch{Project,Filter,Aggregate}Rule, + │ AnnotatedProjectExpression, fragment driver + │ + ▼ +[ Layer 3 ] Backend (opensearch-project/OpenSearch · sandbox/plugins/analytics-backend-datafusion) + Substrait conversion via isthmus → DataFusion native execution +``` + +Knowing which layer a missing function lives in tells you the effort: + +| Bucket | Symptom (failure stack frame) | Layer | Effort | +|---|---|---|---| +| 1 (S0) | `OpenSearchProjectRule` / `OpenSearchFilterRule` throws `No backend supports scalar function [X] among [datafusion]` | Layer 2 capability registry | One line | +| 2 (S1) | Substrait isthmus throws `Unable to convert call X` (the function exists in DataFusion but the call shape doesn't match) | Layer 2 adapter | Small adapter | +| 3 (S1–M) | DataFusion runtime throws (function doesn't exist natively) | Layer 3 UDF | Half-day–multi-day Rust | + +**Layer 1 is almost always already done** for any command that works on the v2 Calcite engine — `CalciteRelNodeVisitor.visitXxx` produces a clean RelNode that's the same regardless of which backend executes it. Verify by reading the v2 unit test (`CalcitePPLXxxTest` in `ppl/src/test/java/.../calcite/`); the printed `LogicalPlan` shows exactly what reaches Layer 2. + +## Local environment setup + +### One-time + +1. **JDK 25** — `sandbox/libs/dataformat-native` targets JDK 25 (FFM API). Without it you'll see `error: release version 25 not supported`. + ```bash + # mise / asdf / sdkman + export JAVA_HOME=/path/to/temurin-25.0.x + export PATH=$JAVA_HOME/bin:$PATH + ``` +2. 
**Native lib built** — confirm `OpenSearch/sandbox/libs/dataformat-native/rust/target/release/libopensearch_native.dylib` (macOS) or `.so` (Linux) exists. Built automatically by `:run`/`:integTest` if missing, but a stale build is the most common source of weird FFM errors. Rebuild explicitly via the cargo target if in doubt. +3. **`gh` CLI** authenticated against your OpenSearch fork for opening PRs. + +### Repos + +| Repo | Branch convention | Purpose | +|---|---|---| +| `opensearch-project/OpenSearch` | `feature/` off `upstream/main` | Where the actual fix lands (analytics-engine and DataFusion backend code) | +| `opensearch-project/sql` | `feature/` off `ahkcs/feature/ppl-coverage-bundle` until the analytics-compatibility test task lands upstream; thereafter off `upstream/feature/mustang-ppl-integration` | Test-infrastructure work, IT carryovers (rare for new functions once QA-side ITs exist) | + +In the OpenSearch checkout: `feature/mustang-fillnull-command` was the dev branch; `pr/fillnull-poc` was the rebased clean branch I pushed for the PR. Use the same split when working on a new function — keep your dev branch with extra debugging/printf commits, then cherry-pick the clean ones onto a `pr/...` branch for PR submission. + +### Maven-local prerequisites (full publish-and-run flow from scratch) + +The `:run` task pulls `opensearch-job-scheduler:3.7.0.0-SNAPSHOT` and `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` from `~/.m2/repository`. If those aren't published yet, the cluster bootstrap fails with `Could not resolve ...` or `Failed installing file:.../opensearch-sql-plugin...zip`. End-to-end: + +```bash +# 1. Publish OpenSearch core + sandbox plugins to maven local +cd /path/to/OpenSearch +JAVA_HOME=/path/to/temurin-25 ./gradlew publishToMavenLocal -Dsandbox.enabled=true + +# 2. Publish SQL plugin to maven local (any JDK 21+ works for the SQL repo) +cd /path/to/sql +./gradlew publishToMavenLocal + +# 3. 
(Optional) Publish opensearch-job-scheduler from its own repo if not already in maven local +# Most devs already have this from prior work — check via: +ls ~/.m2/repository/org/opensearch/plugin/opensearch-job-scheduler/3.7.0.0-SNAPSHOT/ + +# 4. Run the server with all required plugins (back in OpenSearch) +cd /path/to/OpenSearch +JAVA_HOME=/path/to/temurin-25 ./gradlew :run -Dsandbox.enabled=true \ + -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ + 'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ + 'analytics-backend-lucene', 'composite-engine', \ + 'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" +``` + +**`tests.jvm.argline` and `--debug-server-jvm`.** The user-facing Mustang task brief shows: +```bash +-Dtests.jvm.argline="-Djava.library.path=$(pwd)/sandbox/libs/dataformat-native/rust/target/release \ + -Dopensearch.experimental.feature.pluggable.dataformat.enabled=true" \ +--debug-server-jvm +``` +You don't need `-Dtests.jvm.argline` for `:run` — `OpenSearch/gradle/run.gradle` already injects the native-lib path and the `pluggable.dataformat.enabled` flag automatically when `parquet-data-format` or `analytics-backend-datafusion` is in `installedPlugins` (lines 93–101). The flag is only needed for `:internalClusterTest` style tasks where the test JVM (not the cluster JVM) loads the native bridge — committed to `gradle.properties` for those use cases. + +`--debug-server-jvm` is the JVM-debug-attach flag for `:run`. Add it when you want to attach a debugger on port 5005; omit for normal runs. + +**Plugin install order matters.** `opensearch-sql-plugin` declares `analytics-engine` and `opensearch-job-scheduler` as `extendedPlugins`, so those must be installed first. 
Putting `opensearch-sql-plugin` first in the list yields: +``` +IllegalArgumentException: Missing plugin [analytics-engine], dependency of [opensearch-sql] +``` + +## The development loop (Bucket 1 / S0) + +> **Branch base for the SQL plugin checkout.** Until the +> `analyticsCompatibilityTest` task and `tests.analytics.{force_routing,parquet_indices}` +> system properties land in `upstream/feature/mustang-ppl-integration`, base +> your SQL plugin work on `ahkcs/feature/ppl-coverage-bundle` (which rebases on +> top of the integration branch and adds those bits). See +> [`ppl-analytics-engine-coverage-sop.md`](ppl-analytics-engine-coverage-sop.md) +> for the standalone daily-report recipe. + +### 1. Triage — find the failure + +Start an OpenSearch test cluster locally and run the SQL-plugin's `CalciteXxxCommandIT` through the analytics-engine route. The cluster needs all 7 sandbox plugins. + +```bash +# Terminal A — OpenSearch checkout, JDK 25 +cd /path/to/OpenSearch +JAVA_HOME=/path/to/temurin-25 ./gradlew :run -Dsandbox.enabled=true \ + -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ + 'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ + 'analytics-backend-lucene', 'composite-engine', \ + 'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" +``` + +> **Plugin order matters.** `opensearch-sql-plugin` declares `analytics-engine` and `opensearch-job-scheduler` as `extendedPlugins`, so those must install first. The order above is correct. +> **Maven local prerequisites.** `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` and `opensearch-job-scheduler:3.7.0.0-SNAPSHOT` need to be in `~/.m2/repository`. Build/publish from their respective repos via `./gradlew publishToMavenLocal -Dsandbox.enabled=true` once. 
+ +```bash +# Terminal B — sql checkout, any JDK 21+ +cd /path/to/sql +./gradlew :integ-test:integTestRemote \ + -Dtests.rest.cluster=localhost:9200 \ + -Dtests.cluster=localhost:9300 \ + -Dtests.clustername=runTask \ + -Dtests.analytics.force_routing=true \ + -Dtests.analytics.parquet_indices=true \ + --tests "org.opensearch.sql.calcite.remote.CalciteXxxCommandIT" +``` + +The standard `:integTestRemote` runs against the externally-managed `:run` cluster; the two `tests.analytics.*` system properties are what flip the route. JUnit XML lands in `integ-test/build/test-results/integTestRemote/`. + +**Why those two flags exist:** +- `tests.analytics.force_routing=true` → `PPLIntegTestCase.init()` flips the cluster setting `plugins.calcite.analytics.force_routing=true`, which makes `RestUnifiedQueryAction.isAnalyticsIndex()` return `true` unconditionally so every PPL query routes through the analytics-engine path regardless of the index name pattern. +- `tests.analytics.parquet_indices=true` → `TestUtils.createIndexByRestClient` injects the parquet/composite settings on every index it creates: + ```java + indexSettings.put("number_of_shards", 1); + indexSettings.put("pluggable.dataformat.enabled", true); + indexSettings.put("pluggable.dataformat", "composite"); + indexSettings.put("composite.primary_data_format", "parquet"); + ``` + Without parquet-backed storage, the analytics-engine planner fails with `IllegalStateException: No backend can scan all requested fields on index [...]` — DataFusion can't read Lucene-backed indices. + +Without those two flags the same `:integTestRemote` invocation runs the v2 / Calcite path — that's the comparison baseline. 
+ +**Note on `:integ-test:analyticsCompatibilityTest` / `analyticsCompatibilityReport`.** These tasks (defined on `ahkcs/feature/ppl-coverage-bundle`) are *report-generation* helpers — they sweep every PPL IT through the analytics path with `ignoreFailures=true` and bucket the failures into a markdown report (see [`ppl-analytics-engine-coverage-sop.md`](ppl-analytics-engine-coverage-sop.md)). They are not needed for verifying a specific command — `:integTestRemote --tests "..."` with the two system properties is the right primitive for per-command work. + +### 2. Read the failure + +```bash +grep -E "] among [datafusion] + at OpenSearchProjectRule.annotateExpr(OpenSearchProjectRule.java:123) +``` + +**Bucket 2:** +``` +Unable to convert call ANNOTATED_PROJECT_EXPR(fp64?) +``` +or +``` +io.substrait.isthmus.SubstraitRelVisitorException: cannot convert ... +``` + +If the failure is something else entirely (e.g. `Project rule encountered unmarked child [LogicalJoin]`), the command depends on a non-S0 planner gap (joins, sub-queries, etc.) — file a separate issue, don't try to wedge it into the function PR. + +### 3. Wire the function (Bucket 1 / S0) + +For a Bucket-1 / S0 fix, edit one file: + +`OpenSearch/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionAnalyticsBackendPlugin.java` + +```java +private static final Set STANDARD_PROJECT_OPS = Set.of( + ScalarFunction.COALESCE, + ScalarFunction.CEIL, + ScalarFunction. // ← add here +); +``` + +`STANDARD_PROJECT_OPS` is fanned out across `SUPPORTED_FIELD_TYPES` and the backend's supported formats by the `projectCapabilities()` override. The constant for `` must already exist in `sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java` — if it doesn't, add it (see "Adding a new ScalarFunction enum constant" below). 
+ +**No Rust-side, convertor, or Substrait-extension changes needed for Bucket 1.** DataFusion's native runtime executes the call directly via the default Substrait extension catalog already loaded by `DataFusionPlugin.loadSubstraitExtensions`. + +### Adding a new `ScalarFunction` enum constant + +If the function isn't yet in the `ScalarFunction` enum: + +1. Open `sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java`. +2. Pick the right `Category` (`COMPARISON`, `STRING`, `MATH`, `SCALAR`, etc.). +3. Pick the right `SqlKind`. If Calcite has a dedicated `SqlKind` for the function (e.g. `SqlKind.COALESCE`, `SqlKind.CEIL`), use it. Otherwise use `SqlKind.OTHER` and rely on `fromSqlFunction` (which matches by name; the enum constant name must match `SqlFunction.getName().toUpperCase(Locale.ROOT)`). + +```java +NEW_FUNCTION(Category.MATH, SqlKind.OTHER), +``` + +### 4. Verify locally + +Restart the cluster (the `:run` task rebuilds plugins automatically from source on restart), then re-run the IT. + +```bash +# Terminal A — Ctrl-C, then re-run :run +# Terminal B — re-run :integTestRemote with the two analytics-engine system properties +``` + +Or, faster: kill the gradle wrapper PID and re-launch. + +### 5. Add the QA IT in OpenSearch core + +This is the one-time investment that pays off across every future function: a self-contained IT in `sandbox/qa/analytics-engine-rest` that exercises the command through `POST /_analytics/ppl`, no SQL plugin needed. + +#### 5a. 
Dataset resources + +Pick or add a dataset under `sandbox/qa/analytics-engine-rest/src/test/resources/datasets//`: + +``` +datasets// + ├── mapping.json ← OpenSearch mapping with a "settings" block containing "number_of_shards" + └── bulk.json ← NDJSON bulk body, MUST end with a trailing newline (see "Common pitfalls") +``` + +`DatasetProvisioner.injectParquetSettings` injects parquet/composite settings adjacent to `"number_of_shards"`, so the mapping's `settings` block must contain that key. **Existing datasets to reuse:** `clickbench` (43-column wide table, dates, lots of types), `calcs` (fillnull-friendly: nullable int/double/keyword/boolean/date columns; verbose enough for `value=` syntax tests). Add a new dataset only if neither fits. + +#### 5b. Test class + +Place under `sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/CommandIT.java`. Mirror the v2 / Calcite IT in the SQL plugin (`integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteXxxCommandIT.java`) one-test-method-to-one — same query strings, same expected rows. + +```java +public class FillNullCommandIT extends AnalyticsRestTestCase { + + private static final Dataset DATASET = new Dataset("calcs", "calcs"); + private static boolean dataProvisioned = false; + + /** + * Lazily provision the dataset on first invocation. Must be called inside a test method + * (not setUp) — OpenSearchRestTestCase's static client() is not initialized until after + * @BeforeClass but is reliably available inside test bodies. 
+ */ + private void ensureDataProvisioned() throws IOException { + if (dataProvisioned == false) { + DatasetProvisioner.provision(client(), DATASET); + dataProvisioned = true; + } + } + + public void testFillNullSameValueOneField() throws IOException { + assertRows( + "source=" + DATASET.indexName + " | fields str2, num0 | fillnull with -1 in num0", + row("one", 12.3), + row("two", -12.3), + // … + row(null, -1) + ); + } + + // ── helpers ──────────────────────────────────────────────────────────── + + private static List row(Object... values) { + return Arrays.asList(values); + } + + @SafeVarargs + @SuppressWarnings("varargs") + private final void assertRows(String ppl, List... expected) throws IOException { + Map response = executePpl(ppl); + @SuppressWarnings("unchecked") + List> actualRows = (List>) response.get("rows"); + assertNotNull(actualRows); + assertEquals(expected.length, actualRows.size()); + for (int i = 0; i < expected.length; i++) { + List want = expected[i]; + List got = actualRows.get(i); + assertEquals(want.size(), got.size()); + for (int j = 0; j < want.size(); j++) { + assertCellEquals(want.get(j), got.get(j)); + } + } + } + + private Map executePpl(String ppl) throws IOException { + ensureDataProvisioned(); + Request request = new Request("POST", "/_analytics/ppl"); + request.setJsonEntity("{\"query\": \"" + escapeJson(ppl) + "\"}"); + Response response = client().performRequest(request); + return assertOkAndParse(response, "PPL: " + ppl); + } + + /** Numeric-tolerant cell comparison — JSON parsing returns Integer/Long/Double interchangeably. 
*/ + private static void assertCellEquals(Object expected, Object actual) { + if (expected == null || actual == null) { assertEquals(expected, actual); return; } + if (expected instanceof Number && actual instanceof Number) { + if (Double.compare(((Number) expected).doubleValue(), ((Number) actual).doubleValue()) != 0) { + fail("expected <" + expected + "> but was <" + actual + ">"); + } + return; + } + assertEquals(expected, actual); + } +} +``` + +For tests that should fail with a planner-side validation error (e.g. type-incompatibility errors raised in `CalciteRelNodeVisitor` preflight), use a sibling `assertErrorContains` helper that catches `ResponseException` and asserts on the response body substring. + +#### 5c. Run the QA IT + +```bash +cd /path/to/OpenSearch +JAVA_HOME=/path/to/temurin-25 ./gradlew \ + :sandbox:qa:analytics-engine-rest:integTest \ + -Dsandbox.enabled=true \ + --tests "*CommandIT" +``` + +The QA module boots its own self-contained test cluster with all 7 sandbox plugins; no need for a `:run` cluster running. JUnit XML lands at `sandbox/qa/analytics-engine-rest/build/test-results/integTest/TEST-org.opensearch.analytics.qa.CommandIT.xml`. + +### 6. Run the sandbox-wide `check` before pushing + +Always run this before opening or updating a PR: + +```bash +cd /path/to/OpenSearch +JAVA_HOME=/path/to/temurin-25 ./gradlew check -p sandbox -Dsandbox.enabled=true +``` + +This is the sandbox-tree equivalent of `./gradlew check` and runs everything OpenSearch core CI runs over the sandbox subprojects: compile (`-Werror`), unit tests, integ tests where applicable, `forbiddenApis`, `licenseHeaders`, `spotless`/`checkstyle`, `dependencyLicenses`, `thirdPartyAudit`, etc. CI fails on any of these — running locally first avoids the rebase-and-force-push churn when CI catches a missing license header or a forbidden API call. 
+ +Common things this surfaces that focused IT runs miss: +- Missing Apache 2.0 license headers on new files (`licenseHeaders`) +- Code-style violations on the new IT class (`checkstyle`/`spotless`) +- An import that pulls in a forbidden API (e.g. `java.util.Date`, `Optional.get()` in some places) +- A unit test in a sibling module broken by a planner change + +Expect 5–15 min on a warm build, longer on a clean one. If `check` is green, the PR's CI almost always is too. + +## When you need more than capabilities (Bucket 2 / S1) + +If the planner accepts the function but Substrait conversion fails (`Unable to convert call ...`): + +1. Check whether `OpenSearchProjectRule.annotateExpr` is wrapping a sub-call you weren't expecting. The recursive `AnnotatedProjectExpression` strip in `OpenSearchProject.stripAnnotations` should handle every wrapper, but if the function lives somewhere other than a `Project` (e.g. `Filter`, `Aggregate`), the corresponding rel's `stripAnnotations` may not be recursive yet — same fix applies (`RexShuttle` that recursively unwraps). +2. If the call shape itself is wrong for DataFusion (e.g. PPL emits a 3-arg call where DataFusion expects 2), add a `ScalarFunctionAdapter` in the DataFusion backend that rewrites the call into the equivalent DF-native shape. See `BackendPlanAdapter` and the `ScalarFunctionAdapter` SPI for the pattern. + +## When the runtime doesn't support the function (Bucket 3 / M) + +Implement a UDF in `sandbox/plugins/analytics-backend-datafusion/rust/src/udf.rs` and register it in `DataFusionService` startup. Roughly half a day per UDF; see existing UDF registrations for the boilerplate. + +## Common pitfalls + +- **`refresh=wait_for` hangs >60s on parquet-backed indices.** `analytics-backend-lucene`'s `LuceneCommitter.getSafeCommitInfo` is a `TODO:: with index deleter` stub. The QA module's `DatasetProvisioner` already uses `refresh=true` to sidestep this. 
The SQL plugin's `loadDataByRestClient` has a workaround keyed on `tests.analytics.parquet_indices=true`. Symptom: first test in a class run fails with `IllegalStateException: Failed to perform request` and elapsed time exactly 60s. +- **Bulk file missing trailing newline.** `DatasetProvisioner` loads `bulk.json` via `BufferedReader.lines().collect(joining("\n"))`, which drops the final newline. The bulk endpoint requires a trailing `\n`. Workaround: ensure your `bulk.json` has at least two trailing newlines (one row of `\n\n`), or the joined output will be missing the terminator and the bulk fails with `The bulk request must be terminated by a newline [\n]`. +- **`-Werror` blocks `@SafeVarargs`-eligible methods.** OpenSearch's javac is `-Werror`. Generic varargs assertion helpers need `@SafeVarargs @SuppressWarnings("varargs") private final void ...` (the `final` keyword is required for `@SafeVarargs` on instance methods; `private` alone won't compile until JDK 9+, and even then `final` is the safe choice). +- **Plugin install order during `:run`.** `opensearch-sql-plugin` extends `analytics-engine` and `opensearch-job-scheduler` — install both before the SQL plugin or you'll see `Missing plugin [analytics-engine], dependency of [opensearch-sql]`. +- **Gradle 9.x cached transform corruption.** Occasionally `:integTestRemote` / `:analyticsCompatibilityTest` will fail with `The contents of the immutable workspace ... have been modified`. Fix: `rm -rf ~/.gradle/caches/9.x/transforms/` and re-run. +- **Number-format pitfalls in QA assertions.** `assertCellEquals` must use `Double.compare(e.doubleValue(), a.doubleValue())` — Jackson parsing returns `Integer`/`Long`/`Double` based on JSON shape, and naive `.equals()` on cross-type numbers fails even when values are equal. +- **`ScalarFunction.fromSqlKind` returns null for `SqlKind.OTHER`.** If your function uses `SqlKind.OTHER`, the planner falls back to name-based resolution via `fromSqlFunction`. 
Make sure the enum constant name matches the function's SQL name uppercased. + +## PR shape + +For a single Bucket-1 function, a clean PR against `opensearch-project/OpenSearch:main` is: + +| Commit | Files | Purpose | +|---|---|---| +| 1 | `DataFusionAnalyticsBackendPlugin.java` | Add the `ScalarFunction.` to `STANDARD_PROJECT_OPS` | +| 2 | `CommandIT.java`, `datasets//{mapping,bulk}.json` (new or reused) | QA IT exercising the function via `POST /_analytics/ppl` | + +If you also need a planner fix for the same function, prepend it as commit 0 (e.g. fillnull's nested-strip fix). + +PR description should: +- State the bucket (S0 / S1 / M). +- Show the failure mode the fix addresses (paste the cluster-log stack trace). +- List the test file count: `CalciteXxxCommandIT` v2-side go from `N/M → M/M`, and `XxxCommandIT` QA-side is `M/M`. +- Frame it as the ongoing pattern: future Bucket-1 functions are one-line additions to `STANDARD_PROJECT_OPS`. + +**Before pushing**, run the sandbox-wide check (see Step 6 in the dev loop): +```bash +./gradlew check -p sandbox -Dsandbox.enabled=true +``` + +## Files to read once before starting + +- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java` +- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/{Project,Filter,Aggregate}Capability.java` +- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/BackendCapabilityProvider.java` +- `OpenSearch/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/CapabilityRegistry.java` +- `OpenSearch/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchProjectRule.java` +- `OpenSearch/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionAnalyticsBackendPlugin.java` +- 
`OpenSearch/sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/{AnalyticsRestTestCase,DatasetProvisioner,Dataset}.java` +- `sql/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java` — find the `visitXxx` for your command to confirm the RelNode shape it produces +- `sql/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLTest.java` — the printed `LogicalPlan` shows exactly what reaches Layer 2 + +## Reference: fillnull as the worked example + +The fillnull walkthrough that produced this guide: +- PR: [opensearch-project/OpenSearch#21472][pr-fillnull] +- Function: `COALESCE` (Layer 3 wire-up via `STANDARD_PROJECT_OPS`) +- Adjacent: `CEIL` (one fillnull test uses `with ceil(...) in ...`) +- Planner fix bundled: recursive `AnnotatedProjectExpression` strip in `OpenSearchProject.stripAnnotations` (general bug exposed by nested project calls; surfaced once both COALESCE and CEIL became project-capable) +- Result: `CalciteFillNullCommandIT` 2/13 → 13/13; QA-side `FillNullCommandIT` 13/13 + +The IT-infrastructure scaffolding (`tests.analytics.force_routing`, `tests.analytics.parquet_indices`, parquet-backed index helper in `TestUtils.makeParquetBacked`) was carried over from [`ahkcs/sql@29339c9c`](https://github.com/ahkcs/sql/commit/29339c9c0316d5bad43b5bcb13cc91627b54b1d9) on `feature/mustang-bin-command` — that commit is the canonical reference for the SQL-side test infra. The bundle branch's `analyticsCompatibilityTest` / `analyticsCompatibilityReport` Gradle tasks are report-generation helpers built on top of those system properties; per-command verification just needs the two `-D` flags on `:integTestRemote`. Once a function ships its QA-side IT in `sandbox/qa/analytics-engine-rest`, the SQL-side IT route becomes verification-only (the QA IT is the source of truth). 
From 24c529181f96e0e6263b6f2f518593038f58d565 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 6 May 2026 14:30:20 -0700 Subject: [PATCH 2/4] =?UTF-8?q?Drop=20dev=20docs=20from=20this=20PR=20?= =?UTF-8?q?=E2=80=94=20track=20separately?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These local routing-and-coverage notes are useful but belong in their own review thread (or stay as untracked working notes), not in the unified-context fix. Keeping them on disk via .gitignore-style untrack so the PR stays focused on the single api/ change. Signed-off-by: Kai Huang --- docs/dev/ppl-analytics-engine-coverage-sop.md | 152 ------- docs/dev/ppl-analytics-engine-routing.md | 426 ------------------ 2 files changed, 578 deletions(-) delete mode 100644 docs/dev/ppl-analytics-engine-coverage-sop.md delete mode 100644 docs/dev/ppl-analytics-engine-routing.md diff --git a/docs/dev/ppl-analytics-engine-coverage-sop.md b/docs/dev/ppl-analytics-engine-coverage-sop.md deleted file mode 100644 index 710feb5f724..00000000000 --- a/docs/dev/ppl-analytics-engine-coverage-sop.md +++ /dev/null @@ -1,152 +0,0 @@ -# SOP — PPL Analytics-Engine IT Coverage Report - -Run all PPL ITs through the analytics-engine path against an externally-managed -cluster, get a bucketed markdown report. Daily. - -Output: `integ-test/build/reports/analytics-compatibility/REPORT.md` - ---- - -## One-time setup - -JDK 25 + JDK 21, Rust stable, ~15 GB free disk. 
- -```bash -mkdir -p ~/workspace && cd ~/workspace -git clone git@github.com:opensearch-project/OpenSearch.git -git clone git@github.com:opensearch-project/sql.git -( cd OpenSearch && git remote add upstream https://github.com/opensearch-project/OpenSearch.git ) -( cd sql && git remote add ahkcs git@github.com:ahkcs/sql.git ) -``` - -**OpenSearch — track `upstream/main` + apply commons-text patch:** - -`upstream/main` already has the full analytics-engine sandbox plugin set -(`analytics-engine`, `analytics-backend-{datafusion,lucene}`, `composite-engine`, -`dsl-query-executor`, `parquet-data-format`) and the `dataformat-native` rust -crate. Track main; rebase daily. - -```bash -cd ~/workspace/OpenSearch -git fetch upstream main && git checkout -B main upstream/main -``` - -In `sandbox/plugins/analytics-engine/build.gradle`, add next to the existing -`commons-math3` runtimeOnly: - -```groovy -runtimeOnly "org.apache.commons:commons-text:1.11.0" -``` - -`commons-text` is only listed under `resolutionStrategy.force` on main, which -pins the version but doesn't bundle the jar — without this `runtimeOnly` line -the cluster dies on `LevenshteinDistance` the first time a query reaches -Calcite's `SqlFunctions.`. Track upstreaming; once it lands, drop this -patch. - -**Build the native lib (one-time + after rust changes):** - -The cluster JVM loads `libopensearch_native.dylib` (rust + JNI/FFM) at startup -for DataFusion execution, parquet I/O, and native repository ops. Without it -the cluster crashes in `NativeBridge.`. - -```bash -cd ~/workspace/OpenSearch/sandbox/libs/dataformat-native/rust -cargo build --release -p opensearch-native-lib # ~5 min -``` - ---- - -## Daily run (~30 min) - -### 1. 
Get the SQL bundle worktree - -First time: -```bash -cd ~/workspace/sql -git fetch ahkcs feature/ppl-coverage-bundle -DATE=$(date +%Y-%m-%d) -git worktree add "../sql-ppl-coverage-${DATE}" ahkcs/feature/ppl-coverage-bundle -``` - -Reuse same day: -```bash -cd ~/workspace/sql-ppl-coverage-${DATE} -git fetch ahkcs && git rebase ahkcs/feature/ppl-coverage-bundle -``` - -### 2. Publish the SQL plugin to mavenLocal - -`:run` installs `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` from maven coords — -mavenLocal first, then OpenSearch CI snapshots. The SQL plugin isn't on CI -snapshots, so without this step the cluster fails to bootstrap with -`Could not resolve …`. Re-publish whenever the bundle worktree is updated. - -```bash -cd ~/workspace/sql-ppl-coverage-${DATE} -./gradlew publishToMavenLocal # ~1 min, JDK 21+ -``` - -### 3. Bring up cluster (terminal A — leave running) - -```bash -cd ~/workspace/OpenSearch -JAVA_HOME=$(mise where java@temurin-25.0.1+8.0.LTS) ./gradlew :run \ - -Dsandbox.enabled=true \ - -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ -'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ -'analytics-backend-lucene', 'composite-engine', \ -'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" -``` - -Wait for it: -```bash -until curl -sf http://localhost:9200/ >/dev/null; do sleep 4; done -``` - -### 4. Run the report (terminal B) - -```bash -cd ~/workspace/sql-ppl-coverage-${DATE} -rm -rf integ-test/build/test-results/analyticsCompatibility \ - integ-test/build/test-results/analyticsCompatibilityTest \ - integ-test/build/reports/analytics-compatibility - -./gradlew :integ-test:analyticsCompatibilityReport \ - -Dtests.rest.cluster=localhost:9200 \ - -Dtests.cluster=localhost:9300 \ - -Dtests.clustername=runTask -``` - -`BUILD SUCCESSFUL` is expected — failures are the report payload, not a build -break. - -### 5. Open the report - -```bash -open integ-test/build/reports/analytics-compatibility/REPORT.md -``` - -### 6. 
Tear down - -`Ctrl-C` terminal A. - ---- - -## When something breaks - -| Symptom | Fix | -|---|---| -| `BUILD FAILED … java.io.EOFException … KryoBackedDecoder` | `rm -rf integ-test/build/test-results/analyticsCompatibilityTest` and rerun. | -| Run finishes <10 min, top bucket is `ConnectException` | Cluster died. Tail `~/workspace/OpenSearch/build/testclusters/runTask-0/logs/runTask.log` for `fatal error`; the missing class usually needs a `runtimeOnly` dep added to `analytics-engine/build.gradle`. | -| Top bucket is `No backend can scan all requested fields` (thousands) | Parquet flag isn't reaching `TestUtils`. Confirm worktree is on `ahkcs/feature/ppl-coverage-bundle`. | -| `BUILD FAILED` immediately, log shows `[BUILD] Starting OpenSearch process / Stopping node` | `task.getClusters().clear()` is missing. Re-fetch `feature/ppl-coverage-bundle`. | - ---- - -## What the bundle branch is - -`ahkcs/feature/ppl-coverage-bundle` rebases on -`upstream/feature/mustang-ppl-integration` and adds the report task, parquet -flag, origin classifier, pass-rate fix, and testcluster auto-bind workaround. -Daily upstream sync: `git rebase upstream/feature/mustang-ppl-integration`. diff --git a/docs/dev/ppl-analytics-engine-routing.md b/docs/dev/ppl-analytics-engine-routing.md deleted file mode 100644 index e2dcf40b686..00000000000 --- a/docs/dev/ppl-analytics-engine-routing.md +++ /dev/null @@ -1,426 +0,0 @@ -# Routing PPL Commands Through the Analytics-Engine Path - -A guide for wiring an existing PPL command — already working on the v2 / Calcite engine — through the analytics-engine route so that `CalciteXxxCommandIT` passes against the force-routed analytics-engine path, without needing a per-command code change in the SQL plugin. - -This doc is the playbook distilled from landing PPL `fillnull` end-to-end through the analytics engine ([opensearch-project/OpenSearch#21472][pr-fillnull]). 
Pair with [`ppl-commands.md`](ppl-commands.md) (which covers adding a *new* PPL command on the v2 / Calcite engine). - -[pr-fillnull]: https://github.com/opensearch-project/OpenSearch/pull/21472 - -## Goal - -For an existing PPL command `Xxx` whose `CalciteXxxCommandIT` currently passes on the v2 / Calcite engine but fails on the analytics-engine route, drive the analytics-engine route to parity. Two acceptance criteria: - -1. `CalciteXxxCommandIT` (in `opensearch-project/sql`) passes when run via `:integ-test:integTestRemote` with `-Dtests.analytics.force_routing=true -Dtests.analytics.parquet_indices=true`. -2. A self-contained `XxxCommandIT` lives in `opensearch-project/OpenSearch` under `sandbox/qa/analytics-engine-rest`, exercising the command through `POST /_analytics/ppl` against a parquet-backed dataset. CI for OpenSearch core can verify the analytics-engine path without checking out the SQL plugin. - -The QA IT in OpenSearch core is the long-term verification surface — once it lands, the SQL-plugin IT becomes a duplicate maintained for the v2 / Calcite path. 
- -## Mental model — three layers - -A PPL query on the analytics-engine route passes through three independent layers: - -``` -PPL text - │ - ▼ -[ Layer 1 ] SQL plugin lowering (opensearch-project/sql) - │ PPLSyntaxParser → AstBuilder → CalciteRelNodeVisitor → RelNode tree - │ - ▼ -[ Layer 2 ] Analytics-engine planner (opensearch-project/OpenSearch · sandbox/plugins/analytics-engine) - │ Capability registry, OpenSearch{Project,Filter,Aggregate}Rule, - │ AnnotatedProjectExpression, fragment driver - │ - ▼ -[ Layer 3 ] Backend (opensearch-project/OpenSearch · sandbox/plugins/analytics-backend-datafusion) - Substrait conversion via isthmus → DataFusion native execution -``` - -Knowing which layer a missing function lives in tells you the effort: - -| Bucket | Symptom (failure stack frame) | Layer | Effort | -|---|---|---|---| -| 1 (S0) | `OpenSearchProjectRule` / `OpenSearchFilterRule` throws `No backend supports scalar function [X] among [datafusion]` | Layer 2 capability registry | One line | -| 2 (S1) | Substrait isthmus throws `Unable to convert call X` (the function exists in DataFusion but the call shape doesn't match) | Layer 2 adapter | Small adapter | -| 3 (S1–M) | DataFusion runtime throws (function doesn't exist natively) | Layer 3 UDF | Half-day–multi-day Rust | - -**Layer 1 is almost always already done** for any command that works on the v2 Calcite engine — `CalciteRelNodeVisitor.visitXxx` produces a clean RelNode that's the same regardless of which backend executes it. Verify by reading the v2 unit test (`CalcitePPLXxxTest` in `ppl/src/test/java/.../calcite/`); the printed `LogicalPlan` shows exactly what reaches Layer 2. - -## Local environment setup - -### One-time - -1. **JDK 25** — `sandbox/libs/dataformat-native` targets JDK 25 (FFM API). Without it you'll see `error: release version 25 not supported`. - ```bash - # mise / asdf / sdkman - export JAVA_HOME=/path/to/temurin-25.0.x - export PATH=$JAVA_HOME/bin:$PATH - ``` -2. 
**Native lib built** — confirm `OpenSearch/sandbox/libs/dataformat-native/rust/target/release/libopensearch_native.dylib` (macOS) or `.so` (Linux) exists. Built automatically by `:run`/`:integTest` if missing, but a stale build is the most common source of weird FFM errors. Rebuild explicitly via the cargo target if in doubt. -3. **`gh` CLI** authenticated against your OpenSearch fork for opening PRs. - -### Repos - -| Repo | Branch convention | Purpose | -|---|---|---| -| `opensearch-project/OpenSearch` | `feature/` off `upstream/main` | Where the actual fix lands (analytics-engine and DataFusion backend code) | -| `opensearch-project/sql` | `feature/` off `ahkcs/feature/ppl-coverage-bundle` until the analytics-compatibility test task lands upstream; thereafter off `upstream/feature/mustang-ppl-integration` | Test-infrastructure work, IT carryovers (rare for new functions once QA-side ITs exist) | - -In the OpenSearch checkout: `feature/mustang-fillnull-command` was the dev branch; `pr/fillnull-poc` was the rebased clean branch I pushed for the PR. Use the same split when working on a new function — keep your dev branch with extra debugging/printf commits, then cherry-pick the clean ones onto a `pr/...` branch for PR submission. - -### Maven-local prerequisites (full publish-and-run flow from scratch) - -The `:run` task pulls `opensearch-job-scheduler:3.7.0.0-SNAPSHOT` and `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` from `~/.m2/repository`. If those aren't published yet, the cluster bootstrap fails with `Could not resolve ...` or `Failed installing file:.../opensearch-sql-plugin...zip`. End-to-end: - -```bash -# 1. Publish OpenSearch core + sandbox plugins to maven local -cd /path/to/OpenSearch -JAVA_HOME=/path/to/temurin-25 ./gradlew publishToMavenLocal -Dsandbox.enabled=true - -# 2. Publish SQL plugin to maven local (any JDK 21+ works for the SQL repo) -cd /path/to/sql -./gradlew publishToMavenLocal - -# 3. 
(Optional) Publish opensearch-job-scheduler from its own repo if not already in maven local -# Most devs already have this from prior work — check via: -ls ~/.m2/repository/org/opensearch/plugin/opensearch-job-scheduler/3.7.0.0-SNAPSHOT/ - -# 4. Run the server with all required plugins (back in OpenSearch) -cd /path/to/OpenSearch -JAVA_HOME=/path/to/temurin-25 ./gradlew :run -Dsandbox.enabled=true \ - -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ - 'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ - 'analytics-backend-lucene', 'composite-engine', \ - 'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" -``` - -**`tests.jvm.argline` and `--debug-server-jvm`.** The user-facing Mustang task brief shows: -```bash --Dtests.jvm.argline="-Djava.library.path=$(pwd)/sandbox/libs/dataformat-native/rust/target/release \ - -Dopensearch.experimental.feature.pluggable.dataformat.enabled=true" \ ---debug-server-jvm -``` -You don't need `-Dtests.jvm.argline` for `:run` — `OpenSearch/gradle/run.gradle` already injects the native-lib path and the `pluggable.dataformat.enabled` flag automatically when `parquet-data-format` or `analytics-backend-datafusion` is in `installedPlugins` (lines 93–101). The flag is only needed for `:internalClusterTest` style tasks where the test JVM (not the cluster JVM) loads the native bridge — committed to `gradle.properties` for those use cases. - -`--debug-server-jvm` is the JVM-debug-attach flag for `:run`. Add it when you want to attach a debugger on port 5005; omit for normal runs. - -**Plugin install order matters.** `opensearch-sql-plugin` declares `analytics-engine` and `opensearch-job-scheduler` as `extendedPlugins`, so those must be installed first. 
Putting `opensearch-sql-plugin` first in the list yields: -``` -IllegalArgumentException: Missing plugin [analytics-engine], dependency of [opensearch-sql] -``` - -## The development loop (Bucket 1 / S0) - -> **Branch base for the SQL plugin checkout.** Until the -> `analyticsCompatibilityTest` task and `tests.analytics.{force_routing,parquet_indices}` -> system properties land in `upstream/feature/mustang-ppl-integration`, base -> your SQL plugin work on `ahkcs/feature/ppl-coverage-bundle` (which rebases on -> top of the integration branch and adds those bits). See -> [`ppl-analytics-engine-coverage-sop.md`](ppl-analytics-engine-coverage-sop.md) -> for the standalone daily-report recipe. - -### 1. Triage — find the failure - -Start an OpenSearch test cluster locally and run the SQL-plugin's `CalciteXxxCommandIT` through the analytics-engine route. The cluster needs all 7 sandbox plugins. - -```bash -# Terminal A — OpenSearch checkout, JDK 25 -cd /path/to/OpenSearch -JAVA_HOME=/path/to/temurin-25 ./gradlew :run -Dsandbox.enabled=true \ - -PinstalledPlugins="['opensearch-job-scheduler:3.7.0.0-SNAPSHOT', \ - 'analytics-engine', 'parquet-data-format', 'analytics-backend-datafusion', \ - 'analytics-backend-lucene', 'composite-engine', \ - 'opensearch-sql-plugin:3.7.0.0-SNAPSHOT']" -``` - -> **Plugin order matters.** `opensearch-sql-plugin` declares `analytics-engine` and `opensearch-job-scheduler` as `extendedPlugins`, so those must install first. The order above is correct. -> **Maven local prerequisites.** `opensearch-sql-plugin:3.7.0.0-SNAPSHOT` and `opensearch-job-scheduler:3.7.0.0-SNAPSHOT` need to be in `~/.m2/repository`. Build/publish from their respective repos via `./gradlew publishToMavenLocal -Dsandbox.enabled=true` once. 
- -```bash -# Terminal B — sql checkout, any JDK 21+ -cd /path/to/sql -./gradlew :integ-test:integTestRemote \ - -Dtests.rest.cluster=localhost:9200 \ - -Dtests.cluster=localhost:9300 \ - -Dtests.clustername=runTask \ - -Dtests.analytics.force_routing=true \ - -Dtests.analytics.parquet_indices=true \ - --tests "org.opensearch.sql.calcite.remote.CalciteXxxCommandIT" -``` - -The standard `:integTestRemote` runs against the externally-managed `:run` cluster; the two `tests.analytics.*` system properties are what flip the route. JUnit XML lands in `integ-test/build/test-results/integTestRemote/`. - -**Why those two flags exist:** -- `tests.analytics.force_routing=true` → `PPLIntegTestCase.init()` flips the cluster setting `plugins.calcite.analytics.force_routing=true`, which makes `RestUnifiedQueryAction.isAnalyticsIndex()` return `true` unconditionally so every PPL query routes through the analytics-engine path regardless of the index name pattern. -- `tests.analytics.parquet_indices=true` → `TestUtils.createIndexByRestClient` injects the parquet/composite settings on every index it creates: - ```java - indexSettings.put("number_of_shards", 1); - indexSettings.put("pluggable.dataformat.enabled", true); - indexSettings.put("pluggable.dataformat", "composite"); - indexSettings.put("composite.primary_data_format", "parquet"); - ``` - Without parquet-backed storage, the analytics-engine planner fails with `IllegalStateException: No backend can scan all requested fields on index [...]` — DataFusion can't read Lucene-backed indices. - -Without those two flags the same `:integTestRemote` invocation runs the v2 / Calcite path — that's the comparison baseline. 
- -**Note on `:integ-test:analyticsCompatibilityTest` / `analyticsCompatibilityReport`.** These tasks (defined on `ahkcs/feature/ppl-coverage-bundle`) are *report-generation* helpers — they sweep every PPL IT through the analytics path with `ignoreFailures=true` and bucket the failures into a markdown report (see [`ppl-analytics-engine-coverage-sop.md`](ppl-analytics-engine-coverage-sop.md)). They are not needed for verifying a specific command — `:integTestRemote --tests "..."` with the two system properties is the right primitive for per-command work. - -### 2. Read the failure - -```bash -grep -E "No backend supports|Unable to convert" build/testclusters/runTask-0/logs/runTask.log -``` - -**Bucket 1:** -``` -No backend supports scalar function [<FUNC>] among [datafusion] - at OpenSearchProjectRule.annotateExpr(OpenSearchProjectRule.java:123) -``` - -**Bucket 2:** -``` -Unable to convert call ANNOTATED_PROJECT_EXPR(fp64?) -``` -or -``` -io.substrait.isthmus.SubstraitRelVisitorException: cannot convert ... -``` - -If the failure is something else entirely (e.g. `Project rule encountered unmarked child [LogicalJoin]`), the command depends on a non-S0 planner gap (joins, sub-queries, etc.) — file a separate issue, don't try to wedge it into the function PR. - -### 3. Wire the function (Bucket 1 / S0) - -For a Bucket-1 / S0 fix, edit one file: - -`OpenSearch/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionAnalyticsBackendPlugin.java` - -```java -private static final Set<ScalarFunction> STANDARD_PROJECT_OPS = Set.of( - ScalarFunction.COALESCE, - ScalarFunction.CEIL, - ScalarFunction.<FUNC> // ← add here -); -``` - -`STANDARD_PROJECT_OPS` is fanned out across `SUPPORTED_FIELD_TYPES` and the backend's supported formats by the `projectCapabilities()` override. The constant for `<FUNC>` must already exist in `sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java` — if it doesn't, add it (see "Adding a new ScalarFunction enum constant" below). 
- -**No Rust-side, convertor, or Substrait-extension changes needed for Bucket 1.** DataFusion's native runtime executes the call directly via the default Substrait extension catalog already loaded by `DataFusionPlugin.loadSubstraitExtensions`. - -### Adding a new `ScalarFunction` enum constant - -If the function isn't yet in the `ScalarFunction` enum: - -1. Open `sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java`. -2. Pick the right `Category` (`COMPARISON`, `STRING`, `MATH`, `SCALAR`, etc.). -3. Pick the right `SqlKind`. If Calcite has a dedicated `SqlKind` for the function (e.g. `SqlKind.COALESCE`, `SqlKind.CEIL`), use it. Otherwise use `SqlKind.OTHER` and rely on `fromSqlFunction` (which matches by name; the enum constant name must match `SqlFunction.getName().toUpperCase(Locale.ROOT)`). - -```java -NEW_FUNCTION(Category.MATH, SqlKind.OTHER), -``` - -### 4. Verify locally - -Restart the cluster (the `:run` task rebuilds plugins automatically from source on restart), then re-run the IT. - -```bash -# Terminal A — Ctrl-C, then re-run :run -# Terminal B — re-run :integTestRemote with the two analytics-engine system properties -``` - -Or, faster: kill the gradle wrapper PID and re-launch. - -### 5. Add the QA IT in OpenSearch core - -This is the one-time investment that pays off across every future function: a self-contained IT in `sandbox/qa/analytics-engine-rest` that exercises the command through `POST /_analytics/ppl`, no SQL plugin needed. - -#### 5a. 
Dataset resources - -Pick or add a dataset under `sandbox/qa/analytics-engine-rest/src/test/resources/datasets/<name>/`: - -``` -datasets/<name>/ - ├── mapping.json ← OpenSearch mapping with a "settings" block containing "number_of_shards" - └── bulk.json ← NDJSON bulk body, MUST end with a trailing newline (see "Common pitfalls") -``` - -`DatasetProvisioner.injectParquetSettings` injects parquet/composite settings adjacent to `"number_of_shards"`, so the mapping's `settings` block must contain that key. **Existing datasets to reuse:** `clickbench` (43-column wide table, dates, lots of types), `calcs` (fillnull-friendly: nullable int/double/keyword/boolean/date columns; verbose enough for `value=` syntax tests). Add a new dataset only if neither fits. - -#### 5b. Test class - -Place under `sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/<Xxx>CommandIT.java`. Mirror the v2 / Calcite IT in the SQL plugin (`integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteXxxCommandIT.java`) one-test-method-to-one — same query strings, same expected rows. - -```java -public class FillNullCommandIT extends AnalyticsRestTestCase { - - private static final Dataset DATASET = new Dataset("calcs", "calcs"); - private static boolean dataProvisioned = false; - - /** - * Lazily provision the dataset on first invocation. Must be called inside a test method - * (not setUp) — OpenSearchRestTestCase's static client() is not initialized until after - * @BeforeClass but is reliably available inside test bodies. 
- */ - private void ensureDataProvisioned() throws IOException { - if (dataProvisioned == false) { - DatasetProvisioner.provision(client(), DATASET); - dataProvisioned = true; - } - } - - public void testFillNullSameValueOneField() throws IOException { - assertRows( - "source=" + DATASET.indexName + " | fields str2, num0 | fillnull with -1 in num0", - row("one", 12.3), - row("two", -12.3), - // … - row(null, -1) - ); - } - - // ── helpers ──────────────────────────────────────────────────────────── - - private static List<Object> row(Object... values) { - return Arrays.asList(values); - } - - @SafeVarargs - @SuppressWarnings("varargs") - private final void assertRows(String ppl, List<Object>... expected) throws IOException { - Map<String, Object> response = executePpl(ppl); - @SuppressWarnings("unchecked") - List<List<Object>> actualRows = (List<List<Object>>) response.get("rows"); - assertNotNull(actualRows); - assertEquals(expected.length, actualRows.size()); - for (int i = 0; i < expected.length; i++) { - List<Object> want = expected[i]; - List<Object> got = actualRows.get(i); - assertEquals(want.size(), got.size()); - for (int j = 0; j < want.size(); j++) { - assertCellEquals(want.get(j), got.get(j)); - } - } - } - - private Map<String, Object> executePpl(String ppl) throws IOException { - ensureDataProvisioned(); - Request request = new Request("POST", "/_analytics/ppl"); - request.setJsonEntity("{\"query\": \"" + escapeJson(ppl) + "\"}"); - Response response = client().performRequest(request); - return assertOkAndParse(response, "PPL: " + ppl); - } - - /** Numeric-tolerant cell comparison — JSON parsing returns Integer/Long/Double interchangeably. 
*/ - private static void assertCellEquals(Object expected, Object actual) { - if (expected == null || actual == null) { assertEquals(expected, actual); return; } - if (expected instanceof Number && actual instanceof Number) { - if (Double.compare(((Number) expected).doubleValue(), ((Number) actual).doubleValue()) != 0) { - fail("expected <" + expected + "> but was <" + actual + ">"); - } - return; - } - assertEquals(expected, actual); - } -} -``` - -For tests that should fail with a planner-side validation error (e.g. type-incompatibility errors raised in `CalciteRelNodeVisitor` preflight), use a sibling `assertErrorContains` helper that catches `ResponseException` and asserts on the response body substring. - -#### 5c. Run the QA IT - -```bash -cd /path/to/OpenSearch -JAVA_HOME=/path/to/temurin-25 ./gradlew \ - :sandbox:qa:analytics-engine-rest:integTest \ - -Dsandbox.enabled=true \ - --tests "*CommandIT" -``` - -The QA module boots its own self-contained test cluster with all 7 sandbox plugins; no need for a `:run` cluster running. JUnit XML lands at `sandbox/qa/analytics-engine-rest/build/test-results/integTest/TEST-org.opensearch.analytics.qa.CommandIT.xml`. - -### 6. Run the sandbox-wide `check` before pushing - -Always run this before opening or updating a PR: - -```bash -cd /path/to/OpenSearch -JAVA_HOME=/path/to/temurin-25 ./gradlew check -p sandbox -Dsandbox.enabled=true -``` - -This is the sandbox-tree equivalent of `./gradlew check` and runs everything OpenSearch core CI runs over the sandbox subprojects: compile (`-Werror`), unit tests, integ tests where applicable, `forbiddenApis`, `licenseHeaders`, `spotless`/`checkstyle`, `dependencyLicenses`, `thirdPartyAudit`, etc. CI fails on any of these — running locally first avoids the rebase-and-force-push churn when CI catches a missing license header or a forbidden API call. 
- -Common things this surfaces that focused IT runs miss: -- Missing Apache 2.0 license headers on new files (`licenseHeaders`) -- Code-style violations on the new IT class (`checkstyle`/`spotless`) -- An import that pulls in a forbidden API (e.g. `java.util.Date`, `Optional.get()` in some places) -- A unit test in a sibling module broken by a planner change - -Expect 5–15 min on a warm build, longer on a clean one. If `check` is green, the PR's CI almost always is too. - -## When you need more than capabilities (Bucket 2 / S1) - -If the planner accepts the function but Substrait conversion fails (`Unable to convert call ...`): - -1. Check whether `OpenSearchProjectRule.annotateExpr` is wrapping a sub-call you weren't expecting. The recursive `AnnotatedProjectExpression` strip in `OpenSearchProject.stripAnnotations` should handle every wrapper, but if the function lives somewhere other than a `Project` (e.g. `Filter`, `Aggregate`), the corresponding rel's `stripAnnotations` may not be recursive yet — same fix applies (`RexShuttle` that recursively unwraps). -2. If the call shape itself is wrong for DataFusion (e.g. PPL emits a 3-arg call where DataFusion expects 2), add a `ScalarFunctionAdapter` in the DataFusion backend that rewrites the call into the equivalent DF-native shape. See `BackendPlanAdapter` and the `ScalarFunctionAdapter` SPI for the pattern. - -## When the runtime doesn't support the function (Bucket 3 / M) - -Implement a UDF in `sandbox/plugins/analytics-backend-datafusion/rust/src/udf.rs` and register it in `DataFusionService` startup. Roughly half a day per UDF; see existing UDF registrations for the boilerplate. - -## Common pitfalls - -- **`refresh=wait_for` hangs >60s on parquet-backed indices.** `analytics-backend-lucene`'s `LuceneCommitter.getSafeCommitInfo` is a `TODO:: with index deleter` stub. The QA module's `DatasetProvisioner` already uses `refresh=true` to sidestep this. 
The SQL plugin's `loadDataByRestClient` has a workaround keyed on `tests.analytics.parquet_indices=true`. Symptom: first test in a class run fails with `IllegalStateException: Failed to perform request` and elapsed time exactly 60s. -- **Bulk file missing trailing newline.** `DatasetProvisioner` loads `bulk.json` via `BufferedReader.lines().collect(joining("\n"))`, which drops the final newline. The bulk endpoint requires a trailing `\n`. Workaround: ensure your `bulk.json` has at least two trailing newlines (one row of `\n\n`), or the joined output will be missing the terminator and the bulk fails with `The bulk request must be terminated by a newline [\n]`. -- **`-Werror` blocks `@SafeVarargs`-eligible methods.** OpenSearch's javac is `-Werror`. Generic varargs assertion helpers need `@SafeVarargs @SuppressWarnings("varargs") private final void ...` (the `final` keyword is required for `@SafeVarargs` on instance methods; `private` alone won't compile until JDK 9+, and even then `final` is the safe choice). -- **Plugin install order during `:run`.** `opensearch-sql-plugin` extends `analytics-engine` and `opensearch-job-scheduler` — install both before the SQL plugin or you'll see `Missing plugin [analytics-engine], dependency of [opensearch-sql]`. -- **Gradle 9.x cached transform corruption.** Occasionally `:integTestRemote` / `:analyticsCompatibilityTest` will fail with `The contents of the immutable workspace ... have been modified`. Fix: `rm -rf ~/.gradle/caches/9.x/transforms/` and re-run. -- **Number-format pitfalls in QA assertions.** `assertCellEquals` must use `Double.compare(e.doubleValue(), a.doubleValue())` — Jackson parsing returns `Integer`/`Long`/`Double` based on JSON shape, and naive `.equals()` on cross-type numbers fails even when values are equal. -- **`ScalarFunction.fromSqlKind` returns null for `SqlKind.OTHER`.** If your function uses `SqlKind.OTHER`, the planner falls back to name-based resolution via `fromSqlFunction`. 
Make sure the enum constant name matches the function's SQL name uppercased. - -## PR shape - -For a single Bucket-1 function, a clean PR against `opensearch-project/OpenSearch:main` is: - -| Commit | Files | Purpose | -|---|---|---| -| 1 | `DataFusionAnalyticsBackendPlugin.java` | Add the `ScalarFunction.` to `STANDARD_PROJECT_OPS` | -| 2 | `CommandIT.java`, `datasets//{mapping,bulk}.json` (new or reused) | QA IT exercising the function via `POST /_analytics/ppl` | - -If you also need a planner fix for the same function, prepend it as commit 0 (e.g. fillnull's nested-strip fix). - -PR description should: -- State the bucket (S0 / S1 / M). -- Show the failure mode the fix addresses (paste the cluster-log stack trace). -- List the test file count: `CalciteXxxCommandIT` v2-side go from `N/M → M/M`, and `XxxCommandIT` QA-side is `M/M`. -- Frame it as the ongoing pattern: future Bucket-1 functions are one-line additions to `STANDARD_PROJECT_OPS`. - -**Before pushing**, run the sandbox-wide check (see Step 6 in the dev loop): -```bash -./gradlew check -p sandbox -Dsandbox.enabled=true -``` - -## Files to read once before starting - -- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/ScalarFunction.java` -- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/{Project,Filter,Aggregate}Capability.java` -- `OpenSearch/sandbox/libs/analytics-framework/src/main/java/org/opensearch/analytics/spi/BackendCapabilityProvider.java` -- `OpenSearch/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/CapabilityRegistry.java` -- `OpenSearch/sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/planner/rules/OpenSearchProjectRule.java` -- `OpenSearch/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionAnalyticsBackendPlugin.java` -- 
`OpenSearch/sandbox/qa/analytics-engine-rest/src/test/java/org/opensearch/analytics/qa/{AnalyticsRestTestCase,DatasetProvisioner,Dataset}.java` -- `sql/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java` — find the `visitXxx` for your command to confirm the RelNode shape it produces -- `sql/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLTest.java` — the printed `LogicalPlan` shows exactly what reaches Layer 2 - -## Reference: fillnull as the worked example - -The fillnull walkthrough that produced this guide: -- PR: [opensearch-project/OpenSearch#21472][pr-fillnull] -- Function: `COALESCE` (Layer 3 wire-up via `STANDARD_PROJECT_OPS`) -- Adjacent: `CEIL` (one fillnull test uses `with ceil(...) in ...`) -- Planner fix bundled: recursive `AnnotatedProjectExpression` strip in `OpenSearchProject.stripAnnotations` (general bug exposed by nested project calls; surfaced once both COALESCE and CEIL became project-capable) -- Result: `CalciteFillNullCommandIT` 2/13 → 13/13; QA-side `FillNullCommandIT` 13/13 - -The IT-infrastructure scaffolding (`tests.analytics.force_routing`, `tests.analytics.parquet_indices`, parquet-backed index helper in `TestUtils.makeParquetBacked`) was carried over from [`ahkcs/sql@29339c9c`](https://github.com/ahkcs/sql/commit/29339c9c0316d5bad43b5bcb13cc91627b54b1d9) on `feature/mustang-bin-command` — that commit is the canonical reference for the SQL-side test infra. The bundle branch's `analyticsCompatibilityTest` / `analyticsCompatibilityReport` Gradle tasks are report-generation helpers built on top of those system properties; per-command verification just needs the two `-D` flags on `:integTestRemote`. Once a function ships its QA-side IT in `sandbox/qa/analytics-engine-rest`, the SQL-side IT route becomes verification-only (the QA IT is the source of truth). 
From 1d4c1130e8f1b90044c9a0e98cec442f53fd21b5 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 6 May 2026 14:31:38 -0700 Subject: [PATCH 3/4] Address @dai-chen: add CALCITE_ENGINE_ENABLED to default settings map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @dai-chen suggested either adding the setting to the default list at UnifiedQueryContext.java:123 or passing it via the .setting() API on the builder, since only the unified PPL path requires it. Going with option 1 (default-list entry) — it composes the same way every other planning-required default does, and avoids forcing every caller (the production REST handler and every IT) to remember a single magic .setting() call. The IT at UnifiedQueryOpenSearchIT.java:51 was the precedent for the .setting() approach but only one IT needs it; the production caller (RestUnifiedQueryAction) wouldn't have a natural place to wire this without either repeating it everywhere or routing through a helper. Removes the conditional in buildSettings().getSettingValue() — the default map now carries the value the same way QUERY_SIZE_LIMIT, PPL_SUBSEARCH_MAXOUT, and PPL_JOIN_SUBSEARCH_MAXOUT do. 
Signed-off-by: Kai Huang --- .../sql/api/UnifiedQueryContext.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java b/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java index 864b2610897..5670b5fe8b5 100644 --- a/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java +++ b/api/src/main/java/org/opensearch/sql/api/UnifiedQueryContext.java @@ -5,6 +5,7 @@ package org.opensearch.sql.api; +import static org.opensearch.sql.common.setting.Settings.Key.CALCITE_ENGINE_ENABLED; import static org.opensearch.sql.common.setting.Settings.Key.PPL_JOIN_SUBSEARCH_MAXOUT; import static org.opensearch.sql.common.setting.Settings.Key.PPL_SUBSEARCH_MAXOUT; import static org.opensearch.sql.common.setting.Settings.Key.QUERY_SIZE_LIMIT; @@ -115,13 +116,21 @@ public static class Builder { /** * Setting values with defaults from SysLimit.DEFAULT. Only includes planning-required settings * to avoid coupling with OpenSearchSettings. + * + *

{@link Settings.Key#CALCITE_ENGINE_ENABLED} defaults to {@code true} here because the + * unified query path is by definition Calcite-based — every query reaching this context flows + * through Calcite's planner, never the v2 engine. The PPL {@link + * org.opensearch.sql.api.parser.PPLQueryParser} reuses the v2 {@code AstBuilder}, which gates + * Calcite-only commands (e.g. {@code visitTableCommand}) on this setting; without the default, + * those commands fail at parse time even when the cluster setting is true. */ private final Map settings = new HashMap( Map.of( QUERY_SIZE_LIMIT, SysLimit.DEFAULT.querySizeLimit(), PPL_SUBSEARCH_MAXOUT, SysLimit.DEFAULT.subsearchLimit(), - PPL_JOIN_SUBSEARCH_MAXOUT, SysLimit.DEFAULT.joinSubsearchLimit())); + PPL_JOIN_SUBSEARCH_MAXOUT, SysLimit.DEFAULT.joinSubsearchLimit(), + CALCITE_ENGINE_ENABLED, true)); /** * Sets the query language frontend to be used. @@ -225,14 +234,6 @@ private Settings buildSettings() { @Override @SuppressWarnings("unchecked") public T getSettingValue(Key key) { - // The unified query path is always Calcite-based — every query reaching this context - // goes through Calcite's planner, never the v2 engine. Default - // {@code plugins.calcite.enabled} to true so AstBuilder gates (e.g. visitTableCommand) - // that protect v2-only code paths don't block valid analytics-engine queries when the - // underlying setting hasn't been propagated through Builder#setting. 
- if (key == Key.CALCITE_ENGINE_ENABLED && !settings.containsKey(key)) { - return (T) Boolean.TRUE; - } return (T) settings.get(key); } From 4ade92fe0e4cedf8b748e70346081c8b4c25cbec Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 6 May 2026 15:58:16 -0700 Subject: [PATCH 4/4] CalcitePPLRenameIT: switch to column-name-aware row matcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, {@code verifyStandardDataRows} (and a handful of bespoke {@code verifyDataRows} calls in this file) compared rows positionally — they assumed the engine emits columns in the {@code _source} iteration order the v2 / Lucene path produces. Under the analytics-engine route the parquet-backed reader returns columns in storage order, so the same 4 canonical state_country rows came back as {@code [70,"USA",4,"Jake", "California",2023]} instead of {@code [Jake,USA,California,4,2023,70]} and 12 of 22 tests failed despite the data being identical. Replace with a column-name-keyed expected-row map. The helper reads the actual schema from the response, looks up each canonical value by column name, places it at the corresponding schema position, then defers to {@code verifyDataRows} as before. The contract is identical to the existing {@code verifySchema} matcher — both are set-equality on column names — so the test no longer leaks the engine's emission order into the assertion. Each call site passes the canonical column-name list (with rename substitutions where applicable). Tests that don't rename age keep calling the no-arg form. Both paths now pass: * Analytics-engine route (`tests.analytics.force_routing=true`): 20 / 22 (the remaining 2 are `testRenameInAgg` / `testRenameWithBackticksInAgg`, blocked on the Substrait isthmus AVG-binding follow-up tracked in https://github.com/opensearch-project/OpenSearch/pull/21521). 
* v2 / Calcite route (default routing): 20 / 22 (same two tests fail with `NoClassDefFoundError: LevenshteinDistance` only when running against the OS-core `:run` cluster — that bundle is missing commons-text. CI's own integ-test cluster bundles commons-text via the SQL plugin's classloader and isn't affected.) Signed-off-by: Kai Huang --- .../calcite/remote/CalcitePPLRenameIT.java | 142 +++++++++++++++--- 1 file changed, 118 insertions(+), 24 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLRenameIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLRenameIT.java index 6cd0674a2dc..0302a13b159 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLRenameIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLRenameIT.java @@ -14,6 +14,10 @@ import static org.opensearch.sql.util.MatcherUtils.verifySchemaInOrder; import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Map; +import org.hamcrest.Matcher; +import org.json.JSONArray; import org.json.JSONObject; import org.junit.Test; import org.opensearch.sql.ppl.PPLIntegTestCase; @@ -40,7 +44,7 @@ public void testRename() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "name", "country", "state", "month", "year", "renamed_age"); } @Test @@ -77,7 +81,7 @@ public void testRenameToMetaField() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "name", "country", "state", "month", "year", "_ID"); } @Test @@ -156,7 +160,7 @@ public void testRenameWildcardFields() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "nAME", "country", "state", 
"month", "year", "age"); } @Test @@ -171,7 +175,7 @@ public void testRenameMultipleWildcardFields() throws IOException { schema("couNTry", "string"), schema("year", "int"), schema("moNTh", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "name", "couNTry", "state", "moNTh", "year", "age"); } @Test @@ -186,7 +190,7 @@ public void testRenameWildcardPrefix() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "new_na", "country", "state", "month", "year", "age"); } @Test @@ -212,7 +216,7 @@ public void testRenameMultipleWildcards() throws IOException { schema("country", "string"), schema("year", "int"), schema("MoNtH", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "name", "country", "state", "MoNtH", "year", "age"); } @Test @@ -261,12 +265,14 @@ public void testRenamingToExistingField() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyDataRows( + // After `rename name as age`, the original name column overwrites the original age column; + // the (number) age values are gone and only the (string) name values remain under "age". 
+ verifyDataRowsByColumn( result, - rows("Jake", "USA", "California", 4, 2023), - rows("Hello", "USA", "New York", 4, 2023), - rows("John", "Canada", "Ontario", 4, 2023), - rows("Jane", "Canada", "Quebec", 4, 2023)); + rowOf("age", "Jake", "country", "USA", "state", "California", "month", 4, "year", 2023), + rowOf("age", "Hello", "country", "USA", "state", "New York", "month", 4, "year", 2023), + rowOf("age", "John", "country", "Canada", "state", "Ontario", "month", 4, "year", 2023), + rowOf("age", "Jane", "country", "Canada", "state", "Quebec", "month", 4, "year", 2023)); } @Test @@ -296,12 +302,12 @@ public void testRenamingNonExistentFieldToExistingField() throws IOException { schema("country", "string"), schema("year", "int"), schema("month", "int")); - verifyDataRows( + verifyDataRowsByColumn( result, - rows("Jake", "USA", "California", 4, 2023), - rows("Hello", "USA", "New York", 4, 2023), - rows("John", "Canada", "Ontario", 4, 2023), - rows("Jane", "Canada", "Quebec", 4, 2023)); + rowOf("name", "Jake", "country", "USA", "state", "California", "month", 4, "year", 2023), + rowOf("name", "Hello", "country", "USA", "state", "New York", "month", 4, "year", 2023), + rowOf("name", "John", "country", "Canada", "state", "Ontario", "month", 4, "year", 2023), + rowOf("name", "Jane", "country", "Canada", "state", "Quebec", "month", 4, "year", 2023)); } @Test @@ -345,7 +351,7 @@ public void testMultipleRenameWithoutComma() throws IOException { schema("location", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "user_name", "location", "state", "month", "year", "user_age"); } @Test @@ -363,15 +369,103 @@ public void testRenameMixedCommaAndSpace() throws IOException { schema("location", "string"), schema("year", "int"), schema("month", "int")); - verifyStandardDataRows(result); + verifyStandardDataRows(result, "user_name", "location", "state", "month", "year", "user_age"); + } + + /** + * Build a 
{@code column -> value} map from interleaved varargs ({@code key1, val1, key2, val2, + * ...}). Preserves insertion order so the expected-row mapping reads naturally at the call site. + */ + private static Map rowOf(Object... pairs) { + if (pairs.length % 2 != 0) { + throw new IllegalArgumentException("rowOf expects an even number of args (key, value, ...)"); + } + Map row = new LinkedHashMap<>(); + for (int i = 0; i < pairs.length; i += 2) { + row.put((String) pairs[i], pairs[i + 1]); + } + return row; } private void verifyStandardDataRows(JSONObject result) { - verifyDataRows( - result, - rows("Jake", "USA", "California", 4, 2023, 70), - rows("Hello", "USA", "New York", 4, 2023, 30), - rows("John", "Canada", "Ontario", 4, 2023, 25), - rows("Jane", "Canada", "Quebec", 4, 2023, 20)); + verifyStandardDataRows(result, "name", "country", "state", "month", "year", "age"); + } + + /** + * Verify the four canonical state_country rows independently of column order. + * + *

The schema check above ({@code verifySchema}) is set-equality on column names; the data row + * check {@code verifyDataRows} is positional. The two paths the analytics-engine route can take + * return columns in different orders (parquet preserves storage order, the v2 / Lucene path + * preserves {@code _source} iteration order), and either is valid given the contract {@code + * verifySchema} declares. To avoid baking either order into the test, this helper takes the + * canonical-position column names as varargs and reorders the canonical row values to match + * whatever column order the response actually returned. + * + * @param result the response JSON + * @param canonicalColumns the column names of the four canonical rows in {@code (name-or-renamed, + * country-or-renamed, state, month, year, age-or-renamed)} order. Pass the rename target + * where applicable. + */ + private void verifyStandardDataRows(JSONObject result, String... canonicalColumns) { + if (canonicalColumns.length != 6) { + throw new IllegalArgumentException( + "verifyStandardDataRows expects 6 canonical column names; got " + + canonicalColumns.length); + } + Object[][] canonicalValues = + new Object[][] { + {"Jake", "USA", "California", 4, 2023, 70}, + {"Hello", "USA", "New York", 4, 2023, 30}, + {"John", "Canada", "Ontario", 4, 2023, 25}, + {"Jane", "Canada", "Quebec", 4, 2023, 20} + }; + Map[] expectedRows = new LinkedHashMap[canonicalValues.length]; + for (int i = 0; i < canonicalValues.length; i++) { + Map row = new LinkedHashMap<>(); + for (int c = 0; c < canonicalColumns.length; c++) { + row.put(canonicalColumns[c], canonicalValues[i][c]); + } + expectedRows[i] = row; + } + verifyDataRowsByColumn(result, expectedRows); + } + + /** + * Match expected rows against the response by column name, ignoring the response's column + * emission order. For each expected row (a {@code column-name -> value} map), the value at each + * schema position is looked up by name. 
Tests using this helper become engine-order agnostic: a + * parquet-backed response and a Lucene-backed response yield the same assertion outcome as long + * as the column-name-to-value mapping agrees. + */ + @SafeVarargs + @SuppressWarnings("varargs") + private final void verifyDataRowsByColumn( + JSONObject result, Map... expectedRows) { + JSONArray schema = result.getJSONArray("schema"); + int n = schema.length(); + String[] columnOrder = new String[n]; + for (int i = 0; i < n; i++) { + columnOrder[i] = schema.getJSONObject(i).getString("name"); + } + @SuppressWarnings({"unchecked", "rawtypes"}) + Matcher[] rowMatchers = new Matcher[expectedRows.length]; + for (int r = 0; r < expectedRows.length; r++) { + Object[] reordered = new Object[n]; + for (int c = 0; c < n; c++) { + if (!expectedRows[r].containsKey(columnOrder[c])) { + throw new IllegalArgumentException( + "Expected row at index " + + r + + " is missing canonical value for response column [" + + columnOrder[c] + + "]; provided keys: " + + expectedRows[r].keySet()); + } + reordered[c] = expectedRows[r].get(columnOrder[c]); + } + rowMatchers[r] = rows(reordered); + } + verifyDataRows(result, rowMatchers); } }