Skip to content

Commit

Permalink
[GlobPattern] Support brace expansions
Browse files Browse the repository at this point in the history
Extend `GlobPattern` to support brace expansions, e.g., `foo.{c,cpp}` as discussed in https://reviews.llvm.org/D152762#4425203.

The high-level change was to turn `Tokens` into a list that grows each time we see a new brace expansion term. Then, in `GlobPattern::match()`, we must check the input against each token group.

This is a breaking change since `{` will no longer match a literal without escaping. However, `\{` will match the literal `{` before and after this change. Also, from a brief survey of LLVM, it seems that `GlobPattern` is mostly used for symbol and path matching, which likely won't need `{` in their patterns.

See https://github.com/devongovett/glob-match#syntax for a nice glob reference.

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D153587
  • Loading branch information
ellishg committed Aug 30, 2023
1 parent abacab6 commit 8daace8
Show file tree
Hide file tree
Showing 3 changed files with 253 additions and 34 deletions.
69 changes: 55 additions & 14 deletions llvm/include/llvm/Support/GlobPattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements a glob pattern matcher. The glob pattern is the
// rule used by the shell.
// This file implements a glob pattern matcher.
//
//===----------------------------------------------------------------------===//

Expand All @@ -20,30 +19,72 @@
#include "llvm/Support/Error.h"
#include <optional>

// This class represents a glob pattern. Supported metacharacters
// are "*", "?", "\", "[<chars>]", "[^<chars>]", and "[!<chars>]".
namespace llvm {

/// This class implements a glob pattern matcher similar to the one found in
/// bash, but with some key differences. Namely, \p "*" matches all
/// characters, including path separators.
///
/// * \p "?" matches a single character.
/// * \p "*" matches zero or more characters.
/// * \p "[<chars>]" matches one character in the bracket. Character ranges,
/// e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also
/// supported.
/// * \p "{<glob>,...}" matches one of the globs in the list. Nested brace
/// expansions are not supported. If \p MaxSubPatterns is empty then
/// characters \p "{,}" are treated as literals.
/// * \p "\" escapes the next character so it is treated as a literal.
///
/// Some known edge cases are:
/// * \p "]" is allowed as the first character in a character class, i.e.,
/// \p "[]]" is valid and matches the literal \p "]".
/// * The empty character class, i.e., \p "[]", is invalid.
/// * Empty or singleton brace expansions, e.g., \p "{}", \p "{a}", are invalid.
/// * \p "}" and \p "," that are not inside a brace expansion are taken as
/// literals, e.g., \p ",}" is valid but \p "{" is not.
///
/// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to
/// all files named \p "foo.c" or \p "foo.cpp".
class GlobPattern {
// NOTE(review): this listing is a rendered diff with the +/- markers stripped,
// so several members below appear twice (pre-change and post-change forms of
// create(), isTrivialMatchAll(), Bracket and Pat). Confirm against the real
// header before relying on any single declaration.
public:
static Expected<GlobPattern> create(StringRef Pat);
/// \param Pat the pattern to match against
/// \param MaxSubPatterns if provided, limits the number of subpatterns that
///                       may be created by expanding braces; if empty,
///                       brace expansion is disabled
static Expected<GlobPattern>
create(StringRef Pat, std::optional<size_t> MaxSubPatterns = {});
/// \returns \p true if \p S matches this glob pattern
bool match(StringRef S) const;

// Returns true for glob pattern "*". Can be used to avoid expensive
// preparation/acquisition of the input for match().
bool isTrivialMatchAll() const { return Prefix.empty() && Pat == "*"; }
bool isTrivialMatchAll() const {
// A non-empty literal prefix means the pattern is more than a bare "*".
if (!Prefix.empty())
return false;
// Exactly one sub-glob must remain, and it must be the single character "*".
if (SubGlobs.size() != 1)
return false;
return SubGlobs[0].getPat() == "*";
}

private:
bool matchOne(StringRef Str) const;
// Literal leading part of the pattern, up to the first metacharacter.
// NOTE(review): this is a non-owning view; presumably the caller must keep
// the pattern text alive for as long as this object is used — confirm.
StringRef Prefix;

// Brackets with their end position and matched bytes.
struct Bracket {
const char *Next;
BitVector Bytes;
};
SmallVector<Bracket, 0> Brackets;
// One alternative of the pattern; GlobPattern::create() builds one of these
// per string returned by brace expansion.
struct SubGlobPattern {
/// \param Pat the pattern to match against
static Expected<SubGlobPattern> create(StringRef Pat);
/// \returns \p true if \p S matches this glob pattern
bool match(StringRef S) const;
// Returns a view over the pattern characters stored in Pat.
StringRef getPat() const { return StringRef(Pat.data(), Pat.size()); }

StringRef Prefix, Pat;
// Brackets with their end position and matched bytes.
struct Bracket {
// Offset into Pat at which matching resumes after this bracket expression.
size_t NextOffset;
// Indexed by byte value; a set bit means that byte is accepted.
BitVector Bytes;
};
// Parsed bracket expressions; match() consumes them in order as it meets
// each '[' in Pat.
SmallVector<Bracket, 0> Brackets;
// Pattern characters, copied in by create().
SmallVector<char, 0> Pat;
};
// Alternatives tried in order by match(); left empty when the whole pattern
// is a literal prefix.
SmallVector<SubGlobPattern, 1> SubGlobs;
};
}

Expand Down
121 changes: 112 additions & 9 deletions llvm/lib/Support/GlobPattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//

#include "llvm/Support/GlobPattern.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Errc.h"

Expand Down Expand Up @@ -54,18 +53,115 @@ static Expected<BitVector> expand(StringRef S, StringRef Original) {
return BV;
}

Expected<GlobPattern> GlobPattern::create(StringRef S) {
// Expands every "{a,b,...}" group found in S and returns the resulting list of
// brace-free patterns.
//
// When MaxSubPatterns is empty, or S contains no '{', S itself is returned as
// the only pattern. Otherwise S is scanned once; bracket expressions and
// backslash escapes are stepped over so that braces and commas inside them are
// not treated as expansion syntax.
//
// During that scan the function fails with errc::invalid_argument on an
// unmatched '[', a trailing backslash, nested or unterminated braces, or a
// brace group with fewer than two terms. It also fails when the number of
// expanded patterns would exceed *MaxSubPatterns.
static Expected<SmallVector<std::string, 1>>
parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
  SmallVector<std::string> SubPatterns = {S.str()};
  if (!(MaxSubPatterns && S.contains('{')))
    return SubPatterns;

  // One "{...}" group: where it sits in S and the terms it offers.
  struct BraceGroup {
    size_t Start;
    size_t Length;
    SmallVector<StringRef, 2> Terms;
  };
  SmallVector<BraceGroup, 0> Groups;

  // Group currently being scanned, or nullptr when outside any braces. The
  // pointer is always cleared before another group is appended, so vector
  // growth never leaves it dangling.
  BraceGroup *Open = nullptr;
  // Start of the term being scanned; only meaningful while Open is set.
  size_t TermStart = 0;
  const size_t Size = S.size();
  for (size_t Pos = 0; Pos != Size; ++Pos) {
    switch (S[Pos]) {
    case '[':
      // Jump to the closing ']'. Searching from Pos + 2 lets a ']' that
      // directly follows '[' count as a member of the class.
      Pos = S.find(']', Pos + 2);
      if (Pos == std::string::npos)
        return make_error<StringError>("invalid glob pattern, unmatched '['",
                                       errc::invalid_argument);
      break;
    case '{':
      if (Open)
        return make_error<StringError>(
            "nested brace expansions are not supported",
            errc::invalid_argument);
      Open = &Groups.emplace_back();
      Open->Start = Pos;
      TermStart = Pos + 1;
      break;
    case ',':
      // A comma outside braces is an ordinary character.
      if (Open) {
        Open->Terms.push_back(S.substr(TermStart, Pos - TermStart));
        TermStart = Pos + 1;
      }
      break;
    case '}':
      // A '}' outside braces is an ordinary character.
      if (!Open)
        break;
      // No comma was seen, so the group holds at most one term.
      if (Open->Terms.empty())
        return make_error<StringError>(
            "empty or singleton brace expansions are not supported",
            errc::invalid_argument);
      Open->Terms.push_back(S.substr(TermStart, Pos - TermStart));
      Open->Length = Pos - Open->Start + 1;
      Open = nullptr;
      break;
    case '\\':
      // Step over the escaped character; it must exist.
      if (++Pos == Size)
        return make_error<StringError>("invalid glob pattern, stray '\\'",
                                       errc::invalid_argument);
      break;
    default:
      break;
    }
  }
  if (Open)
    return make_error<StringError>("incomplete brace expansion",
                                   errc::invalid_argument);

  // Multiply the term counts of all groups, saturating at the largest size_t
  // instead of overflowing.
  constexpr size_t SizeMax = std::numeric_limits<size_t>::max();
  size_t Total = 1;
  for (const BraceGroup &Group : Groups) {
    const size_t NumTerms = Group.Terms.size();
    if (Total > SizeMax / NumTerms) {
      Total = SizeMax;
      break;
    }
    Total *= NumTerms;
  }
  if (Total > *MaxSubPatterns)
    return make_error<StringError>("too many brace expansions",
                                   errc::invalid_argument);

  // Substitute groups from last to first so that the start offsets of the
  // groups still to be processed stay valid.
  for (const BraceGroup &Group : reverse(Groups)) {
    SmallVector<std::string> Previous = std::move(SubPatterns);
    SubPatterns.clear();
    for (StringRef Term : Group.Terms) {
      for (const std::string &Base : Previous) {
        std::string &Expanded = SubPatterns.emplace_back(Base);
        Expanded.replace(Group.Start, Group.Length, Term);
      }
    }
  }
  return SubPatterns;
}

// Builds a GlobPattern from S. The leading run of characters before the first
// metacharacter is kept as a literal prefix; the remainder is brace-expanded
// (subject to MaxSubPatterns) and each resulting pattern is compiled into a
// SubGlobPattern. Returns the first error produced by brace expansion or by
// compiling a sub-pattern.
Expected<GlobPattern>
GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
GlobPattern Pat;

// Store the prefix that does not contain any metacharacter.
// NOTE(review): two PrefixSize declarations follow because this listing is a
// rendered diff; the second one, which also stops at '{', appears to be the
// post-change line — confirm against the real file.
size_t PrefixSize = S.find_first_of("?*[\\");
size_t PrefixSize = S.find_first_of("?*[{\\");
Pat.Prefix = S.substr(0, PrefixSize);
// No metacharacter at all: the whole pattern is a literal prefix, so no
// sub-globs are created.
if (PrefixSize == std::string::npos)
return Pat;
S = S.substr(PrefixSize);

// Expand braces in the remainder, then compile every expanded pattern; the
// first failure aborts creation.
SmallVector<std::string, 1> SubPats;
if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
return Err;
for (StringRef SubPat : SubPats) {
auto SubGlobOrErr = SubGlobPattern::create(SubPat);
if (!SubGlobOrErr)
return SubGlobOrErr.takeError();
Pat.SubGlobs.push_back(*SubGlobOrErr);
}

return Pat;
}

Expected<GlobPattern::SubGlobPattern>
GlobPattern::SubGlobPattern::create(StringRef S) {
SubGlobPattern Pat;

// Parse brackets.
Pat.Pat = S;
Pat.Pat.assign(S.begin(), S.end());
for (size_t I = 0, E = S.size(); I != E; ++I) {
if (S[I] == '[') {
// ']' is allowed as the first character of a character class. '[]' is
Expand All @@ -83,7 +179,7 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
return BV.takeError();
if (Invert)
BV->flip();
Pat.Brackets.push_back(Bracket{S.data() + J + 1, std::move(*BV)});
Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
I = J;
} else if (S[I] == '\\') {
if (++I == E)
Expand All @@ -95,13 +191,20 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
}

// Returns true if S matches this glob: S must start with the literal Prefix,
// and what follows must match at least one of the stored sub-globs.
bool GlobPattern::match(StringRef S) const {
// NOTE(review): this listing is a rendered diff, so the one-line return below
// appears to be the pre-change body shown next to its replacement — confirm
// against the real file.
return S.consume_front(Prefix) && matchOne(S);
if (!S.consume_front(Prefix))
return false;
// With no sub-globs the pattern was a pure literal, so the match succeeds
// only if nothing is left after the prefix.
if (SubGlobs.empty() && S.empty())
return true;
// Otherwise the remainder has to match one of the alternatives.
for (auto &Glob : SubGlobs)
if (Glob.match(S))
return true;
return false;
}

// Factor the pattern into segments split by '*'. The segments are matched
// sequentially by finding the first occurrence past the end of the previous
// match.
bool GlobPattern::matchOne(StringRef Str) const {
bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
*SavedS = S;
const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
Expand All @@ -118,7 +221,7 @@ bool GlobPattern::matchOne(StringRef Str) const {
continue;
} else if (*P == '[') {
if (Brackets[B].Bytes[uint8_t(*S)]) {
P = Brackets[B++].Next;
P = Pat.data() + Brackets[B++].NextOffset;
++S;
continue;
}
Expand All @@ -143,5 +246,5 @@ bool GlobPattern::matchOne(StringRef Str) const {
}
// All bytes in Str have been matched. Return true if the rest part of Pat is
// empty or contains only '*'.
return Pat.find_first_not_of('*', P - Pat.data()) == std::string::npos;
return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
}

0 comments on commit 8daace8

Please sign in to comment.