From beee3335ba98a64c3388b1d655542bdcd623ca8f Mon Sep 17 00:00:00 2001 From: matyalatte Date: Sun, 1 Sep 2024 19:41:30 +0900 Subject: [PATCH] support glob patterns for --exclude option --- include/glob_match.h | 17 +++++ include/options.h | 3 +- meson.build | 1 + src/glob_match.cpp | 155 +++++++++++++++++++++++++++++++++++++++++++ src/options.cpp | 27 +++----- tests/glob_test.cpp | 88 ++++++++++++++++++++++++ tests/meson.build | 1 + 7 files changed, 275 insertions(+), 17 deletions(-) create mode 100644 include/glob_match.h create mode 100644 src/glob_match.cpp create mode 100644 tests/glob_test.cpp diff --git a/include/glob_match.h b/include/glob_match.h new file mode 100644 index 0000000..76f4d7b --- /dev/null +++ b/include/glob_match.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include "common.h" +#include "regex_utils.h" + +class GlobPattern { + private: + regex_code m_re_pattern; + + public: + // Compiles glob_pattern into a regex. When match_with_parent is true, + // Match() also accepts every path located under a matching path. + explicit GlobPattern(const std::string& glob_pattern, bool match_with_parent = false); + + // Returns true when path matches the compiled glob pattern. + [[nodiscard]] bool Match(const std::string& path) const; +}; diff --git a/include/options.h b/include/options.h index 55f5301..33ce5e5 100644 --- a/include/options.h +++ b/include/options.h @@ -4,6 +4,7 @@ #include #include #include "cpplint_state.h" +#include "glob_match.h" namespace fs = std::filesystem; @@ -81,7 +82,7 @@ class Options { // Filters out files listed in the --exclude command line switch. 
File paths // in the switch are evaluated relative to the current working directory std::vector FilterExcludedFiles(std::vector filenames, - const std::vector& excludes); + const std::vector& excludes); public: Options() : diff --git a/meson.build b/meson.build index d1ec5d2..a2a427f 100644 --- a/meson.build +++ b/meson.build @@ -172,6 +172,7 @@ cpplint_sources = [ 'src/cleanse.cpp', 'src/states.cpp', 'src/nest_info.cpp', + 'src/glob_match.cpp', ] # main binary diff --git a/src/glob_match.cpp b/src/glob_match.cpp new file mode 100644 index 0000000..9365518 --- /dev/null +++ b/src/glob_match.cpp @@ -0,0 +1,155 @@ +#include "glob_match.h" +#include +#include +#include +#include +#include "regex_utils.h" + +// We use a modified version of glob.cpp to convert glob patterns to regex patterns. +// https://github.com/p-ranav/glob/blob/master/source/glob.cpp +// NOTE(review): ESCAPE_SET_OPER defines no capture group for the \1 in ESCAPE_REPL_STR -- confirm. +static constexpr auto SPECIAL_CHARACTERS = std::string_view{"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"}; +static const auto ESCAPE_SET_OPER = RegexCompile(R"([&~|])"); +static const auto ESCAPE_REPL_STR = std::string{R"(\\\1)"}; +// Replaces every occurrence of from in str with to. Returns true if any was found. +static bool string_replace(std::string &str, std::string_view from, std::string_view to) { + std::size_t start_pos = from.empty() ? std::string::npos : str.find(from); + const bool found = start_pos != std::string::npos; + for (; start_pos != std::string::npos; start_pos = str.find(from, start_pos + to.length())) + str.replace(start_pos, from.length(), to); + return found; +} + +// Convert a glob pattern to a regex pattern. +// When match_with_parent is true, the returned regex also matches +// every path located under a path that matches the pattern. 
+static std::string translate(std::string_view pattern, bool match_with_parent) { + std::size_t i = 0, n = pattern.size(); + std::string result_string; + + while (i < n) { + auto c = pattern[i]; + i += 1; + if (c == '*') { + if ((i <= 1 || pattern[i - 2] == '/' || pattern[i - 2] == '\\') && + i < n && pattern[i] == '*' && + (i + 1 == n || pattern[i + 1] == '/' || pattern[i + 1] == '\\')) { + if (i + 1 == n) { + result_string += ".*"; + break; + } else { + result_string += R"((.*[\\/])?)"; + i += 2; + } + } else { + result_string += R"([^\\/]*)"; + } + } else if (c == '?') { + result_string += R"([^\\/])"; + } else if (c == '[') { + auto j = i; + if (j < n && pattern[j] == '!') { + j += 1; + } + if (j < n && pattern[j] == ']') { + j += 1; + } + while (j < n && pattern[j] != ']') { + j += 1; + } + if (j >= n) { + result_string += "\\["; + } else { + auto stuff = std::string(pattern.begin() + i, pattern.begin() + j); + if (stuff.find("--") == std::string::npos) { + string_replace(stuff, std::string_view{"\\"}, std::string_view{R"(\\)"}); + } else { + std::vector chunks; + std::size_t k = 0; + if (pattern[i] == '!') { + k = i + 2; + } else { + k = i + 1; + } + + while (true) { + k = pattern.find('-', k); // start index only; the end bound j is checked below + if (k == std::string_view::npos || k >= j) { + break; + } + chunks.push_back(std::string(pattern.begin() + i, pattern.begin() + k)); + i = k + 1; + k = k + 3; + } + + chunks.push_back(std::string(pattern.begin() + i, pattern.begin() + j)); + // Rebuild stuff from the chunks, escaping backslashes and the hyphens of a + // set difference (--). Hyphens that create ranges shouldn't be escaped. + bool first = true; + for (auto &chunk : chunks) { + string_replace(chunk, std::string_view{"\\"}, std::string_view{R"(\\)"}); + string_replace(chunk, std::string_view{"-"}, std::string_view{R"(\-)"}); + if (first) { + stuff = chunk; + first = false; + } else { + stuff += "-" + chunk; + } + } + } + + // Escape set operations (&&, ~~ and ||). 
+ RegexReplace(ESCAPE_SET_OPER, ESCAPE_REPL_STR, stuff); + i = j + 1; + if (stuff[0] == '!') { + stuff = R"(^\\/)" + std::string(stuff.begin() + 1, stuff.end()); + } else if (stuff[0] == '^' || stuff[0] == '[') { + stuff = "\\" + stuff; // a single backslash escapes the leading '^' or '[' + } + result_string = result_string + "[" + stuff + "]"; + } + } else if (c == '/' || c == '\\') { + // Path separator + result_string += R"([\\/])"; + } else { + // SPECIAL_CHARS + // closing ')', '}' and ']' + // '-' (a range in character set) + // '&', '~', (extended character set operations) + // '#' (comment) and WHITESPACE (ignored) in verbose mode + static std::map special_characters_map; + if (special_characters_map.empty()) { + for (auto &&sc : SPECIAL_CHARACTERS) { + special_characters_map.emplace( + static_cast(sc), std::string("\\") + std::string(1, sc)); + } + } + + if (SPECIAL_CHARACTERS.find(c) != std::string_view::npos) { + result_string += special_characters_map[static_cast(c)]; + } else { + result_string += c; + } + } + } + + if (match_with_parent) { + // Also accept every path located under a path that matches the pattern. 
+ char c = pattern.empty() ? '\0' : pattern.back(); // back() is undefined on an empty view + if (c != '\\' && c != '/') + result_string += R"(([\\/].*)?$)"; + } else { + result_string.push_back('$'); + } + return result_string; +} + +GlobPattern::GlobPattern(const std::string& glob_pattern, bool match_with_parent) { + std::string re_pattern_str = translate(glob_pattern, match_with_parent); + m_re_pattern = RegexCompile(re_pattern_str); +} + +bool GlobPattern::Match(const std::string& path) const { + regex_match re_result = RegexCreateMatchData(m_re_pattern); + return RegexMatch(m_re_pattern, path, re_result); +} diff --git a/src/options.cpp b/src/options.cpp index 8309af6..af37763 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -13,6 +13,7 @@ #include #include "cpplint_state.h" #include "error_suppressions.h" +#include "glob_match.h" #include "regex_utils.h" #include "string_utils.h" #include "version.h" @@ -353,7 +354,7 @@ std::vector Options::ParseArguments(int argc, char** argv, bool quiet = cpplint_state->Quiet(); std::string counting_style = ""; bool recursive = false; - std::vector excludes = {}; + std::vector excludes = {}; int num_threads = -1; m_filters = DEFAULT_FILTERS; @@ -416,7 +417,8 @@ std::vector Options::ParseArguments(int argc, char** argv, std::string val = ArgToValue(opt); if (val != "") { excludes.emplace_back( - fs::weakly_canonical(fs::absolute(val)).make_preferred()); + fs::weakly_canonical(fs::absolute(val)).make_preferred().string(), + true); } } else if (opt.starts_with("--extensions=")) { ProcessExtensionsOption(ArgToValue(opt)); @@ -501,28 +503,21 @@ void Options::ProcessIncludeOrderOption(const std::string& val) { } static bool ShouldBeExcluded(const fs::path& filename, - const std::vector& excludes) { - for (const fs::path& exc : excludes) { - // TODO(matyalatte): support glob patterns for --exclude - if (filename == exc) // same path - return true; - - // Check if exc is a parent path of filename - std::string exc_str = exc.string(); - if (exc_str.back() != fs::path::preferred_separator) { 
- exc_str += fs::path::preferred_separator; - } - if (StrContain(filename.string(), exc_str)) + const std::vector& excludes) { + std::string file_str = filename.string(); + for (const GlobPattern& exc : excludes) { + // Check if file is the same as (or a child of) a glob pattern + if (exc.Match(file_str)) return true; } return false; } std::vector Options::FilterExcludedFiles(std::vector filenames, - const std::vector& excludes) { + const std::vector& excludes) { // remove matching exclude patterns from m_filenames auto new_end = std::remove_if(filenames.begin(), filenames.end(), - [excludes](const fs::path& f)->bool { + [&excludes](const fs::path& f)->bool { return ShouldBeExcluded(f, excludes); }); filenames.erase(new_end, filenames.end()); diff --git a/tests/glob_test.cpp b/tests/glob_test.cpp new file mode 100644 index 0000000..c52c981 --- /dev/null +++ b/tests/glob_test.cpp @@ -0,0 +1,88 @@ +#include +#include +#include "glob_match.h" + +struct GlobCase { + const char* pattern; + const std::string str; + bool expected; + // expected value when parent matching is enabled. 
+ bool expected_parent; +}; + +class GlobMatchTest : public ::testing::TestWithParam { +}; + +const GlobCase glob_cases[] = { + // literal + { "/foo/bar.h", "/foo/bar.h", true, true }, + { "/foo/bar.h", "/foo/bar-h", false, false }, + // any characters + { "/foo/*h", "/foo/bar-h", true, true }, + { "/foo/bar/*h", "/foo/bar-h", false, false }, + { "/foo/bar/*h", "/foo/bar/test.h", true, true }, + { "/*/test.h", "/foo/test.h", true, true }, + { "/*/test.h", "/foo/bar/test.h", false, false }, + { "foo/*", "foo/test.h", true, true }, + { "foo/*", "foo/bar/test.h", false, true }, + // recursive + { "/**/test.h", "/foo/test.h", true, true }, + { "/**/test.h", "/foo/bar/test.h", true, true }, + { "/**bar/test.h", "/foo/bar/test.h", false, false }, + { "**/test.h", "/foo/test.h", true, true }, + { "**/test.h", "/foo/bar/test.h", true, true }, + { "**/test.h", "test.h", true, true }, + { "**bar/test.h", "/foo/bar/test.h", false, false }, + { "foo/**", "foo/test.h", true, true }, + { "foo/**", "foo/bar/test.h", true, true }, + // any single character + { "/foo/bar?h", "/foo/bar.h", true, true }, + { "/foo/bar?h", "/foo/bar..h", false, false }, + { "/foo/bar?h", "/foo/bar/h", false, false }, + // list + { "/foo/[abc].h", "/foo/b.h", true, true }, + { "/foo/[abc].h", "/foo/d.h", false, false }, + // negative list + { "/foo/[!abc].h", "/foo/b.h", false, false }, + { "/foo/[!abc].h", "/foo/d.h", true, true }, + { "/foo/[!abc].h", "/foo//.h", false, false }, + // range + { "/foo/[a-c].h", "/foo/b.h", true, true }, + { "/foo/[a-c].h", "/foo/d.h", false, false }, + // negative range + { "/foo/[!a-c].h", "/foo/b.h", false, false }, + { "/foo/[!a-c].h", "/foo/d.h", true, true }, + { "/foo/[!a-c].h", "/foo//.h", false, false }, + // compare with parent matching + { "/foo/bar", "/foo/bar/baz", false, true }, + { "/foo/*", "/foo/bar/baz", false, true }, + { "/foo/*/", "/foo/bar/baz", false, true }, + { "/foo/**/test", "/foo/bar/baz/test/a.cpp", false, true }, + // windows paths + { 
"C:/foo/bar.h", "C:\\foo\\bar.h", true, true }, + { "C:/foo/bar", "C:\\foo\\bar\\baz", false, true }, + { "C:\\foo\\bar", "C:/foo/bar/baz", false, true }, +}; + +INSTANTIATE_TEST_SUITE_P(GlobMatchTestInstantiation, + GlobMatchTest, + ::testing::ValuesIn(glob_cases)); + +TEST_P(GlobMatchTest, GlobMatch) { + const GlobCase test_case = GetParam(); + GlobPattern glob(test_case.pattern); + bool match = glob.Match(test_case.str); + EXPECT_EQ(test_case.expected, match) << + " pattern: " << test_case.pattern << "\n" << + " str: " << test_case.str; +} + +TEST_P(GlobMatchTest, GlobMatchParentMatching) { + const GlobCase test_case = GetParam(); + // enable parent matching + GlobPattern glob(test_case.pattern, true); + bool match = glob.Match(test_case.str); + EXPECT_EQ(test_case.expected_parent, match) << + " pattern: " << test_case.pattern << "\n" << + " str: " << test_case.str; +} diff --git a/tests/meson.build b/tests/meson.build index 6c60efc..09abf08 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -14,6 +14,7 @@ test_sources = [ 'string_test.cpp', 'lines_test.cpp', 'file_test.cpp', + 'glob_test.cpp', ] # build tests