Revert "[refactor] Move EchoLexer to new SimplerLexer2"
This reverts commit 7bd2132.

We ended up not using SimpleLexer2 for data_lang/j8.

Even though SimpleLexer has more allocations, I think we need real
Tokens (which are currently heap objects) anyway.

There is a TODO to give precise error messages for BadBackslash in echo,
and I think for parse errors in globs too.  For example, unmatched
brackets could get precise location info under strict_glob.
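For context, a minimal sketch of the two interfaces as implied by the call
sites in this diff (the real classes live in frontend/match.py and
cpp/frontend_match.h; the match_func signature here is an assumption,
paraphrased from how the generated matchers are used):

  # Sketch only, not the real Oils code: both classes wrap a generated
  # match_func(s, pos) that returns (id, end_pos).

  class SimpleLexer(object):
    """Next() materializes each token value: one string allocation per token."""

    def __init__(self, match_func, s):
      self.match_func = match_func
      self.s = s
      self.pos = 0

    def Next(self):
      # type: () -> Tuple[int, str]
      id_, end_pos = self.match_func(self.s, self.pos)
      val = self.s[self.pos:end_pos]  # the extra allocation
      self.pos = end_pos
      return id_, val

  class SimpleLexer2(object):
    """Next() returns only (id, end_pos); callers must slice s themselves."""

    def __init__(self, match_func, s):
      self.match_func = match_func
      self.s = s
      self.pos = 0

    def Next(self):
      # type: () -> Tuple[int, int]
      id_, end_pos = self.match_func(self.s, self.pos)
      self.pos = end_pos
      return id_, end_pos

SimpleLexer2 skips the per-token string, but every caller then has to do the
pos = 0 / pos = end_pos bookkeeping deleted below, and a bare (id, end_pos)
pair is one step further from a real Token carrying location info, which the
BadBackslash and strict_glob TODOs need.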
Andy C committed Dec 29, 2023
1 parent 7bd2132 commit 7cca7dc
Showing 6 changed files with 14 additions and 99 deletions.
builtin/io_osh.py (4 additions, 6 deletions)

@@ -69,18 +69,17 @@ def Run(self, cmd_val):
 
     if arg.e:
       new_argv = []  # type: List[str]
-      for arg_str in argv:
+      for a in argv:
         parts = []  # type: List[str]
-        lex = match.EchoLexer(arg_str)
-        pos = 0
+        lex = match.EchoLexer(a)
         while not backslash_c:
-          id_, end_pos = lex.Next()
+          id_, s = lex.Next()
           if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
             break
 
           # Note: DummyToken is OK because EvalCStringToken() doesn't have any
           # syntax errors.
-          tok = lexer.DummyToken(id_, arg_str[pos:end_pos])
+          tok = lexer.DummyToken(id_, s)
           p = word_compile.EvalCStringToken(tok)
 
           # Unusual behavior: '\c' prints what is there and aborts processing!
@@ -89,7 +88,6 @@ def Run(self, cmd_val):
             break
 
           parts.append(p)
-          pos = end_pos
 
       new_argv.append(''.join(parts))
       if backslash_c:  # no more args either
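Both this loop and the printf_osh.py one below consume the lexer the same
way. Here is a self-contained toy of the restored shape; FakeEchoLexer and
_eval_c_token are hypothetical stand-ins for match.EchoLexer and
word_compile.EvalCStringToken, and only \t, \n, and \c are handled:

  EOL_TOK = -1

  class FakeEchoLexer(object):
    """Stand-in yielding (id, value) pairs, like the restored SimpleLexer."""

    def __init__(self, s):
      self.s = s
      self.pos = 0

    def Next(self):
      if self.pos >= len(self.s):
        return EOL_TOK, ''
      if self.s[self.pos] == '\\' and self.pos + 1 < len(self.s):
        val = self.s[self.pos:self.pos + 2]  # backslash escape
        self.pos += 2
        return 1, val
      val = self.s[self.pos]  # literal character
      self.pos += 1
      return 0, val

  def _eval_c_token(id_, val):
    # None mimics EvalCStringToken's signal for \c (abort processing);
    # unknown escapes pass through literally.
    if id_ == 1:
      return {'\\t': '\t', '\\n': '\n', '\\c': None}.get(val, val)
    return val

  parts = []
  lex = FakeEchoLexer('a\\tb\\cignored')
  while True:
    id_, s = lex.Next()
    if id_ == EOL_TOK:
      break
    p = _eval_c_token(id_, s)
    if p is None:  # \c: print what we have and stop
      break
    parts.append(p)
  print(repr(''.join(parts)))  # 'a\tb'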
builtin/printf_osh.py (2 additions, 4 deletions)

@@ -291,15 +291,14 @@ def _Format(self, parts, varargs, locs, out):
 
             c_parts = []  # type: List[str]
             lex = match.EchoLexer(s)
-            pos = 0
             while True:
-              id_, end_pos = lex.Next()
+              id_, tok_val = lex.Next()
               if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                 break
 
               # Note: DummyToken is OK because EvalCStringToken() doesn't have
               # any syntax errors.
-              tok = lexer.DummyToken(id_, s[pos:end_pos])
+              tok = lexer.DummyToken(id_, tok_val)
               p = word_compile.EvalCStringToken(tok)
 
               # Unusual behavior: '\c' aborts processing!
@@ -308,7 +307,6 @@ def _Format(self, parts, varargs, locs, out):
                 break
 
               c_parts.append(p)
-              pos = end_pos
             s = ''.join(c_parts)
 
         elif part.type.id == Id.Format_Time or typ in 'diouxX':
cpp/frontend_match.cc (2 additions, 37 deletions)

@@ -52,41 +52,6 @@ List<Tuple2<Id_t, BigStr*>*>* SimpleLexer::Tokens() {
   return tokens;
 }
 
-Tuple2<Id_t, int> SimpleLexer2::Next() {
-  int id;
-  int end_pos;
-  match_func_(reinterpret_cast<const unsigned char*>(s_->data_), len(s_), pos_,
-              &id, &end_pos);
-
-  pos_ = end_pos;
-  return Tuple2<Id_t, int>(static_cast<Id_t>(id), end_pos);
-}
-
-List<Tuple2<Id_t, BigStr*>*>* SimpleLexer2::Tokens() {
-  auto tokens = NewList<Tuple2<Id_t, BigStr*>*>();
-  int pos = 0;
-  while (true) {
-    auto tup2 = Next();
-    Id_t id = tup2.at0();
-    int end_pos = tup2.at1();
-
-    if (id == Id::Eol_Tok) {
-      break;
-    }
-    log("pos %d end_pos %d", pos, end_pos);
-
-    int len = end_pos - pos_;
-    BigStr* tok_val = NewStr(len);
-    memcpy(tok_val->data_, s_->data_ + pos_, len);  // copy the list item
-    tok_val->data_[len] = '\0';
-
-    // It's annoying that we have to put it on the heap
-    tokens->append(Alloc<Tuple2<Id_t, BigStr*>>(id, tok_val));
-    pos = end_pos;
-  }
-  return tokens;
-}
-
 SimpleLexer* BraceRangeLexer(BigStr* s) {
   return Alloc<SimpleLexer>(&MatchBraceRangeToken, s);
 }
@@ -95,8 +60,8 @@ SimpleLexer* GlobLexer(BigStr* s) {
   return Alloc<SimpleLexer>(&MatchGlobToken, s);
 }
 
-SimpleLexer2* EchoLexer(BigStr* s) {
-  return Alloc<SimpleLexer2>(&MatchEchoToken, s);
+SimpleLexer* EchoLexer(BigStr* s) {
+  return Alloc<SimpleLexer>(&MatchEchoToken, s);
 }
 
 List<Tuple2<Id_t, BigStr*>*>* HistoryTokens(BigStr* s) {
cpp/frontend_match.h (1 addition, 24 deletions)

@@ -44,36 +44,13 @@ class SimpleLexer {
   int pos_;
 };
 
-class SimpleLexer2 {
- public:
-  SimpleLexer2(MatchFunc match_func, BigStr* s)
-      : match_func_(match_func), s_(s), pos_(0) {
-  }
-
-  Tuple2<Id_t, int> Next();
-  List<Tuple2<Id_t, BigStr*>*>* Tokens();
-
-  static constexpr ObjHeader obj_header() {
-    return ObjHeader::ClassFixed(field_mask(), sizeof(SimpleLexer2));
-  }
-
-  static constexpr uint32_t field_mask() {
-    return maskbit(offsetof(SimpleLexer2, s_));
-  }
-
- private:
-  MatchFunc match_func_;
-  BigStr* s_;
-  int pos_;
-};
-
 //
 // Secondary Lexers
 //
 
 SimpleLexer* BraceRangeLexer(BigStr* s);
 SimpleLexer* GlobLexer(BigStr* s);
-SimpleLexer2* EchoLexer(BigStr* s);
+SimpleLexer* EchoLexer(BigStr* s);
 
 List<Tuple2<Id_t, BigStr*>*>* HistoryTokens(BigStr* s);
 List<Tuple2<Id_t, BigStr*>*>* Ps1Tokens(BigStr* s);
cpp/frontend_match_test.cc (0 additions, 23 deletions)

@@ -30,28 +30,6 @@ TEST lexer_test() {
   PASS();
 }
 
-TEST lexer2_test() {
-  match::SimpleLexer2* lex = match::EchoLexer(StrFromC("hi \\t there \\n"));
-
-  List<Tuple2<Id_t, BigStr*>*>* toks = lex->Tokens();
-  for (int i = 0; i < len(toks); i++) {
-    auto* t = toks->at(i);
-    int id = t->at0();
-    if (id == id__Eol_Tok) {
-      break;
-    }
-    log("id = %d", id);
-    log("val = %s", t->at1()->data_);
-  }
-
-  match::SimpleLexer* lex2 = match::BraceRangeLexer(kEmptyString);
-  auto t = lex2->Next();
-  int id = t.at0();
-  ASSERT_EQ(Id::Eol_Tok, id);
-
-  PASS();
-}
-
 TEST func_test() {
   ASSERT_EQ(Id::BoolUnary_G, match::BracketUnary(StrFromC("-G")));
   ASSERT_EQ(Id::Undefined_Tok, match::BracketUnary(StrFromC("-Gz")));
@@ -99,7 +77,6 @@ int main(int argc, char** argv) {
   GREATEST_MAIN_BEGIN();
 
   RUN_TEST(lexer_test);
-  RUN_TEST(lexer2_test);
   RUN_TEST(func_test);
   RUN_TEST(for_test_coverage);
 
frontend/match.py (5 additions, 5 deletions)

@@ -252,10 +252,10 @@ def Tokens(self):
     return tokens
 
 
-# Iterated over in builtin/io_osh.py
+# Iterated over in osh/builtin_pure.py
 def EchoLexer(s):
-  # type: (str) -> SimpleLexer2
-  return SimpleLexer2(ECHO_MATCHER, s)
+  # type: (str) -> SimpleLexer
+  return SimpleLexer(ECHO_MATCHER, s)
 
 
 def BraceRangeLexer(s):
@@ -283,13 +283,13 @@ def J8StrLexer(s):
 
 def HistoryTokens(s):
   # type: (str) -> List[Tuple[Id_t, str]]
-  lex = SimpleLexer2(HISTORY_MATCHER, s)
+  lex = SimpleLexer(HISTORY_MATCHER, s)
   return lex.Tokens()
 
 
 def Ps1Tokens(s):
   # type: (str) -> List[Tuple[Id_t, str]]
-  lex = SimpleLexer2(PS1_MATCHER, s)
+  lex = SimpleLexer(PS1_MATCHER, s)
   return lex.Tokens()
 
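With the revert, EchoLexer, HistoryTokens, and Ps1Tokens are all back on the
single SimpleLexer class. A hedged sketch of the Tokens() helper they rely
on, continuing the SimpleLexer sketch after the commit message (whether the
real method includes the final Eol_Tok in the list isn't visible in this
diff; here it stops before it):

  def Tokens(self):
    # type: () -> List[Tuple[int, str]]
    tokens = []
    while True:
      id_, val = self.Next()
      if id_ == Id.Eol_Tok:  # assumed sentinel handling
        break
      tokens.append((id_, val))
    return tokens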
