Skip to content

Commit

Permalink
Refactor JsonPathTokenizer (facebookincubator#9573)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#9573

- Replace folly::StringPiece with std::string_view.
- Replace `folly::Expected<std::string, bool>` with `std::optional<std::string>`.
- Replace private methods isDotKeyFormat and isUnquotedBracketKeyFormat with free functions in .cpp file.

Reviewed By: xiaoxmeng

Differential Revision: D56451797

fbshipit-source-id: e035386f731ac2c03f2ec85f7403386ee4fdcdb8
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Apr 23, 2024
1 parent 3279ebd commit 63b4595
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 47 deletions.
2 changes: 1 addition & 1 deletion velox/functions/prestosql/json/JsonExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class JsonExtractor {

while (kTokenizer.hasNext()) {
if (auto token = kTokenizer.getNext()) {
tokens_.push_back(token.value());
tokens_.emplace_back(token.value());
} else {
tokens_.clear();
return false;
Expand Down
59 changes: 31 additions & 28 deletions velox/functions/prestosql/json/JsonPathTokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

namespace facebook::velox::functions {

namespace {
const char ROOT = '$';
const char DOT = '.';
const char COLON = ':';
Expand All @@ -29,7 +30,17 @@ const char UNDER_SCORE = '_';
const char OPEN_BRACKET = '[';
const char CLOSE_BRACKET = ']';

bool JsonPathTokenizer::reset(folly::StringPiece path) {
bool isUnquotedBracketKeyFormat(char c) {
return c == UNDER_SCORE || c == STAR || std::isalnum(c);
}

bool isDotKeyFormat(char c) {
return c == COLON || c == DASH || isUnquotedBracketKeyFormat(c);
}

} // namespace

bool JsonPathTokenizer::reset(std::string_view path) {
if (path.empty() || path[0] != ROOT) {
return false;
}
Expand All @@ -42,87 +53,79 @@ bool JsonPathTokenizer::hasNext() const {
return index_ < path_.size();
}

ParseResult JsonPathTokenizer::getNext() {
std::optional<std::string> JsonPathTokenizer::getNext() {
if (match(DOT)) {
return matchDotKey();
}
if (match(OPEN_BRACKET)) {
auto token =
match(QUOTE) ? matchQuotedSubscriptKey() : matchUnquotedSubscriptKey();
if (!token || !match(CLOSE_BRACKET)) {
return folly::makeUnexpected(false);
return std::nullopt;
}
return token;
}
return folly::makeUnexpected(false);
return std::nullopt;
}

bool JsonPathTokenizer::match(char expected) {
if (index_ < path_.size() && path_[index_] == expected) {
if (hasNext() && path_[index_] == expected) {
index_++;
return true;
}
return false;
}

ParseResult JsonPathTokenizer::matchDotKey() {
std::optional<std::string> JsonPathTokenizer::matchDotKey() {
auto start = index_;
while ((index_ < path_.size()) && isDotKeyFormat(path_[index_])) {
while (hasNext() && isDotKeyFormat(path_[index_])) {
index_++;
}
if (index_ <= start) {
return folly::makeUnexpected(false);
if (index_ == start) {
return std::nullopt;
}
return path_.subpiece(start, index_ - start).str();
return std::string(path_.substr(start, index_ - start));
}

ParseResult JsonPathTokenizer::matchUnquotedSubscriptKey() {
std::optional<std::string> JsonPathTokenizer::matchUnquotedSubscriptKey() {
auto start = index_;
while (index_ < path_.size() && isUnquotedBracketKeyFormat(path_[index_])) {
while (hasNext() && isUnquotedBracketKeyFormat(path_[index_])) {
index_++;
}
if (index_ <= start) {
return folly::makeUnexpected(false);
if (index_ == start) {
return std::nullopt;
}
return path_.subpiece(start, index_ - start).str();
return std::string(path_.substr(start, index_ - start));
}

// Reference Presto logic in
// src/test/java/io/prestosql/operator/scalar/TestJsonExtract.java and
// src/main/java/io/prestosql/operator/scalar/JsonExtract.java
ParseResult JsonPathTokenizer::matchQuotedSubscriptKey() {
std::optional<std::string> JsonPathTokenizer::matchQuotedSubscriptKey() {
bool escaped = false;
std::string token;
while ((index_ < path_.size()) && (escaped || path_[index_] != QUOTE)) {
while (hasNext() && (escaped || path_[index_] != QUOTE)) {
if (escaped) {
if (path_[index_] != QUOTE && path_[index_] != BACK_SLASH) {
return folly::makeUnexpected(false);
return std::nullopt;
}
escaped = false;
token.append(1, path_[index_]);
} else {
if (path_[index_] == BACK_SLASH) {
escaped = true;
} else if (path_[index_] == QUOTE) {
return folly::makeUnexpected(false);
return std::nullopt;
} else {
token.append(1, path_[index_]);
}
}
index_++;
}
if (escaped || token.empty() || !match(QUOTE)) {
return folly::makeUnexpected(false);
return std::nullopt;
}
return token;
}

bool JsonPathTokenizer::isDotKeyFormat(char c) {
return c == COLON || c == DASH || isUnquotedBracketKeyFormat(c);
}

bool JsonPathTokenizer::isUnquotedBracketKeyFormat(char c) {
return c == UNDER_SCORE || c == STAR || std::isalnum(c);
}

} // namespace facebook::velox::functions
21 changes: 8 additions & 13 deletions velox/functions/prestosql/json/JsonPathTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,36 +16,31 @@

#pragma once

#include <folly/Expected.h>
#include <folly/Range.h>
#include <optional>
#include <string>

namespace facebook::velox::functions {

using ParseResult = folly::Expected<std::string, bool>;

class JsonPathTokenizer {
public:
bool reset(folly::StringPiece path);
bool reset(std::string_view path);

bool hasNext() const;

ParseResult getNext();
std::optional<std::string> getNext();

private:
bool match(char expected);
ParseResult matchDotKey();

ParseResult matchUnquotedSubscriptKey();

ParseResult matchQuotedSubscriptKey();
std::optional<std::string> matchDotKey();

bool isDotKeyFormat(char c);
std::optional<std::string> matchUnquotedSubscriptKey();

bool isUnquotedBracketKeyFormat(char c);
std::optional<std::string> matchQuotedSubscriptKey();

private:
size_t index_;
folly::StringPiece path_;
std::string_view path_;
};

} // namespace facebook::velox::functions
2 changes: 1 addition & 1 deletion velox/functions/prestosql/json/SIMDJsonExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ bool SIMDJsonExtractor::tokenize(const std::string& path) {

while (tokenizer.hasNext()) {
if (auto token = tokenizer.getNext()) {
tokens_.push_back(token.value());
tokens_.emplace_back(token.value());
} else {
tokens_.clear();
return false;
Expand Down
11 changes: 7 additions & 4 deletions velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,19 @@ using TokenList = std::vector<std::string>;
}

// The test is ported from Presto for compatibility
folly::Expected<TokenList, bool> getTokens(const std::string& path) {
std::optional<TokenList> getTokens(const std::string& path) {
JsonPathTokenizer tokenizer;
tokenizer.reset(path);
if (!tokenizer.reset(path)) {
return std::nullopt;
}

TokenList tokens;
while (tokenizer.hasNext()) {
if (auto token = tokenizer.getNext()) {
tokens.push_back(token.value());
tokens.emplace_back(token.value());
} else {
tokens.clear();
return folly::makeUnexpected(false);
return std::nullopt;
}
}
return tokens;
Expand Down

0 comments on commit 63b4595

Please sign in to comment.