From 7ad90beaf727696246a424e5782c5fdd1b69f12f Mon Sep 17 00:00:00 2001
From: Masha Basmanova
Date: Tue, 23 Apr 2024 05:01:23 -0700
Subject: [PATCH] Make leading '$.' optional in JSON paths accepted by
 json_extract[_scalar] Presto functions

Summary: The json_extract_scalar and json_extract Presto functions allow paths
like 'foo' and 'foo.bar'. These paths are non-standard. They are enabled via
the Jayway engine, which handles paths without a leading '$' by prepending
the path with '$.'. Path 'foo' becomes '$.foo'. However, this logic has some
unexpected side effects. Path '.foo' becomes '$..foo', which uses the deep
scan operator '..' and matches 'foo' keys anywhere in the document, rather
than just at the top level.

This change allows Velox to support paths like 'foo' and 'foo.bar[2].bar',
but not '.foo' or '[10]'. These paths are handled as if they contained the
leading '$.' characters.

See https://github.com/prestodb/presto/issues/22589

Part of https://github.com/facebookincubator/velox/issues/7049

Differential Revision: D56465662
---
 .../prestosql/json/JsonPathTokenizer.cpp      |  36 +++-
 .../prestosql/json/JsonPathTokenizer.h        |  34 +++-
 .../json/tests/JsonPathTokenizerTest.cpp      | 170 ++++++++++--------
 .../prestosql/tests/JsonExtractScalarTest.cpp |  35 +++-
 .../prestosql/tests/JsonFunctionsTest.cpp     |   9 +
 5 files changed, 197 insertions(+), 87 deletions(-)

diff --git a/velox/functions/prestosql/json/JsonPathTokenizer.cpp b/velox/functions/prestosql/json/JsonPathTokenizer.cpp
index 93b038783212..1c785b9df6a3 100644
--- a/velox/functions/prestosql/json/JsonPathTokenizer.cpp
+++ b/velox/functions/prestosql/json/JsonPathTokenizer.cpp
@@ -41,11 +41,35 @@ bool isDotKeyFormat(char c) {
 } // namespace
 
 bool JsonPathTokenizer::reset(std::string_view path) {
-  if (path.empty() || path[0] != ROOT) {
+  if (path.empty()) {
+    index_ = 0;
+    path_ = {};
+    return false;
+  }
+
+  if (path[0] == ROOT) {
+    index_ = 1;
+    path_ = path;
+    return true;
+  }
+
+  // Presto supplements the basic JsonPath implementation with the Jayway
+  // engine, which allows paths that do not start with '$'. Jayway simply
+  // prepends such paths with '$.', so a path like 'foo' becomes '$.foo'.
+  // However, this also converts a path like '.foo' into '$..foo', which uses
+  // the deep scan operator (..) and changes the meaning of the path in an
+  // unexpected way.
+  //
+  // Here, we allow paths like 'foo', 'foo.bar', and similar. We do not allow
+  // paths like '.foo' or '[0]'.
+
+  index_ = 0;
+  if (path[0] == DOT || path[0] == OPEN_BRACKET) {
+    path_ = {};
     return false;
   }
-  index_ = 1;
   path_ = path;
+
   return true;
 }
 
@@ -54,9 +78,16 @@ bool JsonPathTokenizer::hasNext() const {
 }
 
 std::optional<std::string> JsonPathTokenizer::getNext() {
+  if (index_ == 0) {
+    // We are parsing the first token in a path that doesn't start with '$'.
+    // In this case, we assume the path starts with '$.'.
+    return matchDotKey();
+  }
+
   if (match(DOT)) {
     return matchDotKey();
   }
+
   if (match(OPEN_BRACKET)) {
     auto token =
         match(QUOTE) ? matchQuotedSubscriptKey() : matchUnquotedSubscriptKey();
@@ -65,6 +96,7 @@ std::optional<std::string> JsonPathTokenizer::getNext() {
     }
     return token;
   }
+
   return std::nullopt;
 }
 
diff --git a/velox/functions/prestosql/json/JsonPathTokenizer.h b/velox/functions/prestosql/json/JsonPathTokenizer.h
index 8032e24d7107..dfba062a33b0 100644
--- a/velox/functions/prestosql/json/JsonPathTokenizer.h
+++ b/velox/functions/prestosql/json/JsonPathTokenizer.h
@@ -21,12 +21,40 @@
 
 namespace facebook::velox::functions {
 
+/// Splits a JSON path into individual tokens.
+///
+/// Supports a subset of JSONPath syntax:
+/// https://goessner.net/articles/JsonPath/
+///
+/// $ - the root element
+/// . or [] - child operator
+/// * - wildcard (all objects/elements regardless of their names)
+///
+/// Supports quoted keys.
+///
+/// Notably, doesn't support deep scan, e.g. $..author.
+///
+/// The leading '$.' is optional. Paths '$.foo.bar' and 'foo.bar' are
+/// equivalent. This is not part of JSONPath syntax, but it matches the
+/// behavior of the Jayway implementation used by Presto.
+///
+/// Examples:
+///   "$.store.book[0].author"
+///   "store.book[0].author"
 class JsonPathTokenizer {
  public:
+  /// Resets the tokenizer to a new path. This method must be called and
+  /// return true before calling hasNext or getNext.
+  ///
+  /// @return true if the path is not empty and its first character suggests
+  /// a possibly valid path.
   bool reset(std::string_view path);
 
+  /// Returns true if there are more tokens to be extracted.
   bool hasNext() const;
 
+  /// Returns the next token or std::nullopt if there are no more tokens left
+  /// or the remaining path is invalid.
   std::optional<std::string> getNext();
 
  private:
@@ -38,8 +66,12 @@ class JsonPathTokenizer {
 
   std::optional<std::string> matchQuotedSubscriptKey();
 
-  private:
+  // The index of the next character to process. This is at least one for
+  // standard paths that start with '$'. It can be zero if 'reset' was called
+  // with a non-standard path that doesn't have a leading '$'.
   size_t index_;
+
+  // The path specified in 'reset'.
   std::string_view path_;
 };
 
diff --git a/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp b/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp
index a1f278d92de3..12dbb021607f 100644
--- a/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp
+++ b/velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp
@@ -17,41 +17,15 @@
 #include "velox/functions/prestosql/json/JsonPathTokenizer.h"
 #include 
-
 #include "gtest/gtest.h"
 
 using namespace std::string_literals;
-using facebook::velox::functions::JsonPathTokenizer;
 
 using TokenList = std::vector<std::string>;
 
-#define EXPECT_TOKEN_EQ(path, expected)     \
-  {                                         \
-    auto jsonPath = path;                   \
-    auto tokens = getTokens(jsonPath);      \
-    EXPECT_TRUE(bool(tokens));              \
-    EXPECT_EQ(expected, tokens.value());    \
-  }
-
-#define EXPECT_TOKEN_INVALID(path)    \
-  {                                   \
-    auto tokens = getTokens(path);    \
-    EXPECT_FALSE(bool(tokens));       \
-  }
+namespace facebook::velox::functions {
+namespace {
 
-#define EXPECT_QUOTED_TOKEN_EQ(path, expected)   \
-  {                                              \
-    auto quotedPath = "$[\"" + path + "\"]";     \
-    EXPECT_TOKEN_EQ(quotedPath, expected);       \
-  }
-
-#define EXPECT_UNQUOTED_TOKEN_INVALID(path)   \
-  {                                           \
-    auto invalidPath = "$." + path;           \
-    EXPECT_TOKEN_INVALID(invalidPath);        \
-  }
-
-// The test is ported from Presto for compatibility
-std::optional<TokenList> getTokens(const std::string& path) {
+std::optional<TokenList> getTokens(std::string_view path) {
   JsonPathTokenizer tokenizer;
   if (!tokenizer.reset(path)) {
     return std::nullopt;
@@ -69,58 +43,100 @@ std::optional<TokenList> getTokens(const std::string& path) {
   return tokens;
 }
 
-TEST(JsonPathTokenizerTest, tokenizeTest) {
-  EXPECT_TOKEN_EQ("$"s, TokenList());
-  EXPECT_TOKEN_EQ("$.foo"s, TokenList{"foo"s});
-  EXPECT_TOKEN_EQ("$[\"foo\"]"s, TokenList{"foo"s});
-  EXPECT_TOKEN_EQ("$[\"foo.bar\"]"s, TokenList{"foo.bar"s});
-  EXPECT_TOKEN_EQ("$[42]"s, TokenList{"42"s});
-  EXPECT_TOKEN_EQ("$.42"s, TokenList{"42"s});
-  EXPECT_TOKEN_EQ("$.42.63"s, (TokenList{"42"s, "63"s}));
-  EXPECT_TOKEN_EQ(
-      "$.foo.42.bar.63"s, (TokenList{"foo"s, "42"s, "bar"s, "63"s}));
-  EXPECT_TOKEN_EQ("$.x.foo"s, (TokenList{"x"s, "foo"s}));
-  EXPECT_TOKEN_EQ("$.x[\"foo\"]"s, (TokenList{"x"s, "foo"s}));
-  EXPECT_TOKEN_EQ("$.x[42]"s, (TokenList{"x"s, "42"s}));
-  EXPECT_TOKEN_EQ("$.foo_42._bar63"s, (TokenList{"foo_42"s, "_bar63"s}));
-  EXPECT_TOKEN_EQ("$[foo_42][_bar63]"s, (TokenList{"foo_42"s, "_bar63"s}));
-  EXPECT_TOKEN_EQ("$.foo:42.:bar63"s, (TokenList{"foo:42"s, ":bar63"s}));
-  EXPECT_TOKEN_EQ(
-      "$[\"foo:42\"][\":bar63\"]"s, (TokenList{"foo:42"s, ":bar63"s}));
-  EXPECT_TOKEN_EQ(
+// 'path' should start with '$'.
+void assertValidPath(std::string_view path, const TokenList& expectedTokens) {
+  auto tokens = getTokens(path);
+
+  EXPECT_TRUE(tokens.has_value()) << "Invalid JSON path: " << path;
+  if (!tokens.has_value()) {
+    return;
+  }
+
+  EXPECT_EQ(expectedTokens, tokens.value());
+}
+
+void assertQuotedToken(
+    const std::string& token,
+    const TokenList& expectedTokens) {
+  auto quotedPath = "$[\"" + token + "\"]";
+  assertValidPath(quotedPath, expectedTokens);
+
+  auto invalidPath = "$." + token;
+  EXPECT_FALSE(getTokens(invalidPath));
+}
+
+// The test is ported from Presto for compatibility.
+TEST(JsonPathTokenizerTest, validPaths) { + assertValidPath("$"s, TokenList()); + assertValidPath("$.foo"s, TokenList{"foo"s}); + assertValidPath("$[\"foo\"]"s, TokenList{"foo"s}); + assertValidPath("$[\"foo.bar\"]"s, TokenList{"foo.bar"s}); + assertValidPath("$[42]"s, TokenList{"42"s}); + assertValidPath("$.42"s, TokenList{"42"s}); + assertValidPath("$.42.63"s, TokenList{"42"s, "63"s}); + assertValidPath("$.foo.42.bar.63"s, TokenList{"foo"s, "42"s, "bar"s, "63"s}); + assertValidPath("$.x.foo"s, TokenList{"x"s, "foo"s}); + assertValidPath("$.x[\"foo\"]"s, TokenList{"x"s, "foo"s}); + assertValidPath("$.x[42]"s, TokenList{"x"s, "42"s}); + assertValidPath("$.foo_42._bar63"s, TokenList{"foo_42"s, "_bar63"s}); + assertValidPath("$[foo_42][_bar63]"s, TokenList{"foo_42"s, "_bar63"s}); + assertValidPath("$.foo:42.:bar63"s, TokenList{"foo:42"s, ":bar63"s}); + assertValidPath( + "$[\"foo:42\"][\":bar63\"]"s, TokenList{"foo:42"s, ":bar63"s}); + assertValidPath( "$.store.fruit[*].weight", - (TokenList{"store"s, "fruit"s, "*"s, "weight"s})); - EXPECT_QUOTED_TOKEN_EQ("!@#$%^&*()[]{}/?'"s, TokenList{"!@#$%^&*()[]{}/?'"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("!@#$%^&*()[]{}/?'"s); - EXPECT_QUOTED_TOKEN_EQ("ab\u0001c"s, TokenList{"ab\u0001c"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("ab\u0001c"s); - EXPECT_QUOTED_TOKEN_EQ("ab\0c"s, TokenList{"ab\0c"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("ab\0c"s); - EXPECT_QUOTED_TOKEN_EQ("ab\t\n\rc"s, TokenList{"ab\t\n\rc"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("ab\t\n\rc"s); - EXPECT_QUOTED_TOKEN_EQ("."s, TokenList{"."s}); - EXPECT_UNQUOTED_TOKEN_INVALID("."s); - EXPECT_QUOTED_TOKEN_EQ("$"s, TokenList{"$"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("$"s); - EXPECT_QUOTED_TOKEN_EQ("]"s, TokenList{"]"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("]"s); - EXPECT_QUOTED_TOKEN_EQ("["s, TokenList{"["s}); - EXPECT_UNQUOTED_TOKEN_INVALID("["s); - EXPECT_QUOTED_TOKEN_EQ("'"s, TokenList{"'"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("'"s); - EXPECT_QUOTED_TOKEN_EQ( + TokenList{"store"s, "fruit"s, "*"s, "weight"s}); + assertValidPath( + "$.store.book[*].author", TokenList{"store", "book", "*", "author"}); + assertValidPath("$.store.*", TokenList({"store", "*"})); + + // Paths without leading '$'. + assertValidPath("foo", TokenList({"foo"})); + assertValidPath("foo[12].bar", TokenList({"foo", "12", "bar"})); + assertValidPath("foo.bar.baz", TokenList({"foo", "bar", "baz"})); + + assertQuotedToken("!@#$%^&*()[]{}/?'"s, TokenList{"!@#$%^&*()[]{}/?'"s}); + assertQuotedToken("ab\u0001c"s, TokenList{"ab\u0001c"s}); + assertQuotedToken("ab\0c"s, TokenList{"ab\0c"s}); + assertQuotedToken("ab\t\n\rc"s, TokenList{"ab\t\n\rc"s}); + assertQuotedToken("."s, TokenList{"."s}); + assertQuotedToken("$"s, TokenList{"$"s}); + assertQuotedToken("]"s, TokenList{"]"s}); + assertQuotedToken("["s, TokenList{"["s}); + assertQuotedToken("'"s, TokenList{"'"s}); + assertQuotedToken( "!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s, TokenList{"!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s}); - EXPECT_UNQUOTED_TOKEN_INVALID("!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s); - EXPECT_QUOTED_TOKEN_EQ("a\\\\b\\\""s, TokenList{"a\\b\""s}); - EXPECT_UNQUOTED_TOKEN_INVALID("a\\\\b\\\""s); - EXPECT_QUOTED_TOKEN_EQ("ab\\\"cd\\\"ef"s, TokenList{"ab\"cd\"ef"s}); + assertQuotedToken("a\\\\b\\\""s, TokenList{"a\\b\""s}); + assertQuotedToken("ab\\\"cd\\\"ef"s, TokenList{"ab\"cd\"ef"s}); +} - // backslash not followed by valid escape - EXPECT_TOKEN_INVALID("$[\"a\\ \"]"s); +TEST(JsonPathTokenizerTest, invalidPaths) { + JsonPathTokenizer tokenizer; + + // Empty path. 
+ EXPECT_FALSE(getTokens("")); + + // Backslash not followed by valid escape. + EXPECT_FALSE(getTokens("$[\"a\\ \"]"s)); - // colon in subscript must be quoted - EXPECT_TOKEN_INVALID("$[foo:bar]"s); + // Colon in subscript must be quoted. + EXPECT_FALSE(getTokens("$[foo:bar]"s)); - EXPECT_TOKEN_INVALID("$.store.book["); + // Open bracket without close bracket. + EXPECT_FALSE(getTokens("$.store.book[")); + + // Unsupported deep scan operator. + EXPECT_FALSE(getTokens("$..store")); + EXPECT_FALSE(getTokens("..store")); + EXPECT_FALSE(getTokens("$..[3].foo")); + EXPECT_FALSE(getTokens("..[3].foo")); + + // Paths without leading '$'. + EXPECT_FALSE(getTokens("[1].foo")); + EXPECT_FALSE(getTokens(".[1].foo")); + EXPECT_FALSE(getTokens(".foo.bar.baz")); } + +} // namespace +} // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp b/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp index 5cd98d531614..b62107330599 100644 --- a/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp +++ b/velox/functions/prestosql/tests/JsonExtractScalarTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "velox/common/base/tests/GTestUtils.h" #include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" #include "velox/functions/prestosql/types/JsonType.h" @@ -103,6 +104,10 @@ TEST_F(JsonExtractScalarTest, simple) { jsonExtractScalar( R"([{"k1":[{"k2": ["v1", "v2"]}]}])", "$[0].k1[0].k2[1]"), "v2"); + + // Paths without leading '$'. + EXPECT_EQ(jsonExtractScalar(R"({"k1":"v1"})", "k1"), "v1"); + EXPECT_EQ(jsonExtractScalar(R"({"k1":{"k2": 999}})", "k1.k2"), "999"); } TEST_F(JsonExtractScalarTest, jsonType) { @@ -164,13 +169,29 @@ TEST_F(JsonExtractScalarTest, utf8) { } TEST_F(JsonExtractScalarTest, invalidPath) { - EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", ""), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", "$[]"), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", "$[-1]"), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$k1"), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$.k1."), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$.k1]"), VeloxUserError); - EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1)", "$.k1]"), VeloxUserError); + VELOX_ASSERT_THROW(jsonExtractScalar(R"([0,1,2])", ""), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"([0,1,2])", "$[]"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"([0,1,2])", "$[-1]"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":"v1"})", "$k1"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":"v1"})", "$.k1."), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":"v1"})", "$.k1]"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":"v1)", "$.k1]"), "Invalid JSON path"); + + // Paths without leading '$'. 
+ VELOX_ASSERT_THROW(jsonExtractScalar(R"([1,2])", "[0]"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"([1,2])", ".[0]"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":"v1"})", ".k1"), "Invalid JSON path"); + VELOX_ASSERT_THROW( + jsonExtractScalar(R"({"k1":{"k2": 999}})", ".k1.k2"), + "Invalid JSON path"); } // simdjson, like Presto java, returns the large number as-is as a string, diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 5c625a11a9df..575482bad953 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -601,6 +601,15 @@ TEST_F(JsonFunctionsTest, jsonExtract) { jsonExtract(kJson, "$.store.book[*].isbn")); EXPECT_EQ("\"Evelyn Waugh\"", jsonExtract(kJson, "$.store.book[1].author")); + // Paths without leading '$'. + auto json = R"({"x": {"a": 1, "b": [10, 11, 12]} })"; + EXPECT_EQ(R"({"a": 1, "b": [10, 11, 12]})", jsonExtract(json, "x")); + EXPECT_EQ("1", jsonExtract(json, "x.a")); + EXPECT_EQ("[10, 11, 12]", jsonExtract(json, "x.b")); + EXPECT_EQ("12", jsonExtract(json, "x.b[2]")); + EXPECT_EQ(std::nullopt, jsonExtract(json, "x.c")); + EXPECT_EQ(std::nullopt, jsonExtract(json, "x.b[20]")); + // TODO The following paths are supported by Presto via Jayway, but do not // work in Velox yet. Figure out how to add support for these. VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");
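For reference, below is a minimal, standalone sketch (not part of the patch) of the behavior this change enables. It mirrors the getTokens() helper from JsonPathTokenizerTest.cpp above; the tokenize() wrapper is illustrative only, and it assumes getNext() returns std::optional<std::string>, as the header suggests.

// Illustrative usage sketch only; not code from this patch.
#include <cassert>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include "velox/functions/prestosql/json/JsonPathTokenizer.h"

using facebook::velox::functions::JsonPathTokenizer;
using TokenList = std::vector<std::string>;

// Tokenizes 'path', returning std::nullopt if the path is rejected.
// Mirrors the getTokens() helper in JsonPathTokenizerTest.cpp.
std::optional<TokenList> tokenize(std::string_view path) {
  JsonPathTokenizer tokenizer;
  if (!tokenizer.reset(path)) {
    return std::nullopt;
  }

  TokenList tokens;
  while (tokenizer.hasNext()) {
    auto token = tokenizer.getNext();
    if (!token.has_value()) {
      return std::nullopt;
    }
    tokens.push_back(token.value());
  }
  return tokens;
}

int main() {
  // A standard path with the leading '$.'.
  assert(
      tokenize("$.store.book[0].author") ==
      TokenList({"store", "book", "0", "author"}));

  // The same path without the leading '$.' now produces the same tokens.
  assert(
      tokenize("store.book[0].author") ==
      TokenList({"store", "book", "0", "author"}));

  // Paths starting with '.' or '[' are still rejected; prepending '$.' to
  // them would change their meaning (e.g. '.foo' would become '$..foo').
  assert(tokenize(".store") == std::nullopt);
  assert(tokenize("[0].author") == std::nullopt);

  return 0;
}

The last two calls show why paths that start with '.' or '[' remain invalid: blindly prepending '$.' would produce the deep scan operator '..' or an unintended subscript, which is exactly the Jayway side effect described in the summary.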