Skip to content

Conversation

@vitalybuka
Copy link
Collaborator

@vitalybuka vitalybuka commented Oct 21, 2025

Finds longest (almost) plain substring in the pattern.

Implementation is conservative to avoid false positives.

The result is not used to optimize
GlobPattern::match() so it's calculated on
request.

For #164545.

Created using spr 1.3.6
@llvmbot
Copy link
Member

llvmbot commented Oct 21, 2025

@llvm/pr-subscribers-llvm-support

Author: Vitaly Buka (vitalybuka)

Changes

Finds longest (almost) plain substring in the pattern.

Implementation is conservative to avoid false positives.

The result is not used to optimize
GlobPattern::match() so it's calculated on
request.


Full diff: https://github.com/llvm/llvm-project/pull/164512.diff

3 Files Affected:

  • (modified) llvm/include/llvm/Support/GlobPattern.h (+15-7)
  • (modified) llvm/lib/Support/GlobPattern.cpp (+49-8)
  • (modified) llvm/unittests/Support/GlobPatternTest.cpp (+58)
diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h
index c1b44849b9794..4824f3fa01e5b 100644
--- a/llvm/include/llvm/Support/GlobPattern.h
+++ b/llvm/include/llvm/Support/GlobPattern.h
@@ -63,22 +63,30 @@ class GlobPattern {
   // Returns true for glob pattern "*". Can be used to avoid expensive
   // preparation/acquisition of the input for match().
   bool isTrivialMatchAll() const {
-    if (!Prefix.empty())
+    if (PrefixSize)
       return false;
-    if (!Suffix.empty())
+    if (SuffixSize)
       return false;
     if (SubGlobs.size() != 1)
       return false;
     return SubGlobs[0].getPat() == "*";
   }
 
-  StringRef prefix() const { return Prefix; }
-  StringRef suffix() const { return Suffix; }
+  // The following functions serve as shortcuts for some matching. They are
+  // conservative to simplify implementations.
 
-private:
-  StringRef Prefix;
-  StringRef Suffix;
+  // Returns plain prefix of the pattern.
+  StringRef prefix() const { return Pattern.take_front(PrefixSize); }
+  // Returns plain suffix of the pattern.
+  StringRef suffix() const { return Pattern.take_back(SuffixSize); }
+  // Returns the longest plain substring of the pattern between the prefix and
+  // the suffix.
+  StringRef longest_substr() const;
 
+private:
+  StringRef Pattern;
+  size_t PrefixSize = 0;
+  size_t SuffixSize = 0;
   struct SubGlobPattern {
     /// \param Pat the pattern to match against
     LLVM_ABI static Expected<SubGlobPattern> create(StringRef Pat);
diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 0ecf47dc1d3d1..dfc1508ce63af 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -132,24 +132,60 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
   return std::move(SubPatterns);
 }
 
+/// Returns the longest run of literal characters in \p S, i.e. the longest
+/// substring that contains none of the glob metacharacters '?', '*', '[', '{'
+/// or '\'.
+///
+/// The scan is deliberately conservative: escaped characters and bracket
+/// expressions are skipped rather than interpreted, and any brace expansion
+/// makes the function give up and return an empty string. When several runs
+/// share the maximum length, the earliest one is returned.
+static StringRef maxPlainSubstring(StringRef S) {
+  StringRef Best;
+  while (!S.empty()) {
+    size_t PlainSize = S.find_first_of("?*[{\\");
+    if (PlainSize == std::string::npos)
+      PlainSize = S.size();
+
+    if (Best.size() < PlainSize)
+      Best = S.take_front(PlainSize);
+    S = S.drop_front(PlainSize);
+
+    // The input can end in plain text (for instance when the last backslash
+    // is itself escaped and a literal follows it), so there may be no
+    // metacharacter left to inspect. front() must not be called then.
+    if (S.empty())
+      break;
+
+    switch (S.front()) {
+    case '\\':
+      // Skip the backslash and the character it escapes. A trailing lone
+      // backslash has nothing after it, so never drop more than is left.
+      S = S.drop_front(S.size() > 1 ? 2 : 1);
+      break;
+    case '[': {
+      // A ']' directly after '[' belongs to the set instead of closing it, so
+      // the search for the closing bracket starts one character later.
+      // Otherwise "[]abc]" would wrongly report "abc]" as plain text.
+      size_t EndBracket = S.find(']', 2);
+      if (EndBracket == std::string::npos)
+        return Best; // Incorrect, but let SubGlobPattern::create handle it.
+      S = S.drop_front(EndBracket + 1);
+      break;
+    }
+    case '{':
+      // TODO: implement.
+      return {};
+    default:
+      // '?' or '*': a one-character metacharacter.
+      S = S.drop_front(1);
+    }
+  }
+
+  return Best;
+}
+
 Expected<GlobPattern>
 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;
+  Pat.Pattern = S;
 
   // Store the prefix that does not contain any metacharacter.
-  size_t PrefixSize = S.find_first_of("?*[{\\");
-  Pat.Prefix = S.substr(0, PrefixSize);
-  if (PrefixSize == std::string::npos)
+  Pat.PrefixSize = S.find_first_of("?*[{\\");
+  if (Pat.PrefixSize == std::string::npos) {
+    Pat.PrefixSize = S.size();
     return Pat;
-  S = S.substr(PrefixSize);
+  }
+  S = S.substr(Pat.PrefixSize);
 
   // Just in case we stop on unmatched opening brackets.
   size_t SuffixStart = S.find_last_of("?*[]{}\\");
   assert(SuffixStart != std::string::npos);
   if (S[SuffixStart] == '\\')
     ++SuffixStart;
-  ++SuffixStart;
-  Pat.Suffix = S.substr(SuffixStart);
+  if (SuffixStart < S.size())
+    ++SuffixStart;
+  Pat.SuffixSize = S.size() - SuffixStart;
   S = S.substr(0, SuffixStart);
 
   SmallVector<std::string, 1> SubPats;
@@ -199,10 +235,15 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
   return Pat;
 }
 
+// Scans only the middle of the pattern: the leading PrefixSize characters are
+// what prefix() returns and the trailing SuffixSize characters are what
+// suffix() returns, so neither is part of the search.
+StringRef GlobPattern::longest_substr() const {
+  StringRef Middle = Pattern.drop_front(PrefixSize);
+  Middle = Middle.drop_back(SuffixSize);
+  return maxPlainSubstring(Middle);
+}
+
 bool GlobPattern::match(StringRef S) const {
-  if (!S.consume_front(Prefix))
+  if (!S.consume_front(prefix()))
     return false;
-  if (!S.consume_back(Suffix))
+  if (!S.consume_back(suffix()))
     return false;
   if (SubGlobs.empty() && S.empty())
     return true;
diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp
index 58fd7678131c6..a0e0d1415f383 100644
--- a/llvm/unittests/Support/GlobPatternTest.cpp
+++ b/llvm/unittests/Support/GlobPatternTest.cpp
@@ -329,6 +329,64 @@ TEST_F(GlobPatternTest, PrefixSuffix) {
   EXPECT_EQ("cd", Pat->suffix());
 }
 
+TEST_F(GlobPatternTest, Substr) { // longest_substr() skips prefix()/suffix().
+  auto Pat = GlobPattern::create("");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("abcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr()); // No metacharacter: all of it is prefix.
+
+  Pat = GlobPattern::create("a*bcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr()); // Only "*" lies between prefix and suffix.
+
+  Pat = GlobPattern::create("*abcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr()); // "abcd" is the suffix.
+
+  Pat = GlobPattern::create("abcd*");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr()); // "abcd" is the prefix.
+
+  Pat = GlobPattern::create("a*bc*d");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bc", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bc*def*g");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("def", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd*ef*g");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd*efg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr()); // "bcd" and "efg" tie; the first wins.
+
+  Pat = GlobPattern::create("a*bcd[ef]g*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr()); // "[ef]" separates "bcd" from "g".
+
+  Pat = GlobPattern::create("a*bcde\\fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr()); // The escaped "f" is skipped.
+
+  Pat = GlobPattern::create("a*bcde\\[fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr()); // The escaped "[" opens no set.
+
+  Pat = GlobPattern::create("a*bcde?fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr()); // "?" also ends a plain run.
+
+  Pat = GlobPattern::create("a*bcdef{g}*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr()); // Brace expansion: gives up entirely.
+}
+
 TEST_F(GlobPatternTest, Pathological) {
   std::string P, S(40, 'a');
   StringRef Pieces[] = {"a*", "[ba]*", "{b*,a*}*"};

Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

This PR adds a new longest_substr() method to GlobPattern that finds the longest plain (non-wildcard) substring within a glob pattern, excluding the prefix and suffix. The implementation is conservative to avoid false positives and is calculated on-demand rather than during pattern matching.

Key Changes:

  • Added longest_substr() method to find the longest plain substring between wildcards
  • Refactored internal storage from StringRef Prefix/Suffix to store original pattern with size offsets
  • Added comprehensive test coverage for various glob pattern scenarios

Reviewed Changes

Copilot reviewed 3 out of 3 changed files in this pull request and generated 3 comments.

File Description
llvm/include/llvm/Support/GlobPattern.h Added longest_substr() declaration and refactored internal storage to use pattern with size offsets
llvm/lib/Support/GlobPattern.cpp Implemented maxPlainSubstring() helper and longest_substr() method, updated pattern storage
llvm/unittests/Support/GlobPatternTest.cpp Added comprehensive test cases for longest_substr() functionality

Created using spr 1.3.6

[skip ci]
Created using spr 1.3.6
Created using spr 1.3.6

[skip ci]
Created using spr 1.3.6
Created using spr 1.3.7

[skip ci]
Created using spr 1.3.7
@vitalybuka vitalybuka changed the base branch from main to users/vitalybuka/spr/main.globpattern-add-globpatternlongest_substr-2 October 21, 2025 23:48
@vitalybuka vitalybuka requested a review from Copilot October 21, 2025 23:49
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

Copilot reviewed 3 out of 3 changed files in this pull request and generated 3 comments.

Created using spr 1.3.7
@vitalybuka vitalybuka requested a review from qinkunbao October 21, 2025 23:54
Created using spr 1.3.7
Created using spr 1.3.7
@vitalybuka vitalybuka changed the title [GlobPattern] Add GlobPattern::longest_substr(). [NFC][GlobPattern] Add GlobPattern::longest_substr() Oct 22, 2025
Created using spr 1.3.7

[skip ci]
Created using spr 1.3.7
vitalybuka added a commit that referenced this pull request Oct 23, 2025
Replace two StringRefs with One StringRef + 2 x size_t.

Prepare for:
* #164512
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Oct 23, 2025
…n (#164513)

Replace two StringRefs with One StringRef + 2 x size_t.

Prepare for:
* llvm/llvm-project#164512
lukel97 and others added 2 commits October 22, 2025 20:10
Created using spr 1.3.7

[skip ci]
Created using spr 1.3.7
@vitalybuka vitalybuka changed the base branch from users/vitalybuka/spr/main.globpattern-add-globpatternlongest_substr-2 to main October 23, 2025 03:10
Created using spr 1.3.7
@vitalybuka vitalybuka enabled auto-merge (squash) October 23, 2025 03:12
@vitalybuka vitalybuka merged commit 6fdef0b into main Oct 23, 2025
9 of 10 checks passed
@vitalybuka vitalybuka deleted the users/vitalybuka/spr/globpattern-add-globpatternlongest_substr branch October 23, 2025 03:46
mikolaj-pirog pushed a commit to mikolaj-pirog/llvm-project that referenced this pull request Oct 23, 2025
)

Replace two StringRefs with One StringRef + 2 x size_t.

Prepare for:
* llvm#164512
mikolaj-pirog pushed a commit to mikolaj-pirog/llvm-project that referenced this pull request Oct 23, 2025
Finds longest (almost) plain substring in the pattern.

Implementation is conservative to avoid false positives.

The result is not used to optimize
`GlobPattern::match()` so it's calculated on
request.

For
* llvm#164545

---------

Co-authored-by: Luke Lau <luke@igalia.com>
dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
)

Replace two StringRefs with One StringRef + 2 x size_t.

Prepare for:
* llvm#164512
dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
Finds longest (almost) plain substring in the pattern.

Implementation is conservative to avoid false positives.

The result is not used to optimize
`GlobPattern::match()` so it's calculated on
request.

For
* llvm#164545

---------

Co-authored-by: Luke Lau <luke@igalia.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants