[ysh language] Parse most of YSH case statement (#1633)

The pattern can be:

- a list of words
- a list of expressions
- an eggex
- (else) if nothing matches

We have some tricky/hacky logic in _NewlineOkForYshCase() due to overly intimate CommandParser -> WordParser -> Lexer interaction.

---------

Co-authored-by: Andy Chu <andy@oilshell.org>
oilshell · Jun 3, 2023 · c0e91d4 · c0e91d4
1 parent e4387c6
commit c0e91d4
Show file tree

Hide file tree

Showing 12 changed files with 516 additions and 47 deletions.
diff --git a/core/test_lib.py b/core/test_lib.py
@@ -349,7 +349,11 @@ def InitWordParser(word_str, oil_at=False, arena=None):
 
 def InitCommandParser(code_str, arena=None):
     arena = arena or MakeArena('<test_lib>')
-    parse_ctx = InitParseContext(arena=arena)
+
+    loader = pyutil.GetResourceLoader()
+    oil_grammar = pyutil.LoadOilGrammar(loader)
+
+    parse_ctx = InitParseContext(arena=arena, oil_grammar=oil_grammar)
     line_reader, _ = InitLexer(code_str, arena)
     c_parser = parse_ctx.MakeOshParser(line_reader)
     return c_parser

diff --git a/frontend/lexer.py b/frontend/lexer.py
@@ -119,6 +119,11 @@ def LookAheadOne(self, lex_mode):
       tok_type, _ = match.OneToken(lex_mode, line_str, pos)
       return tok_type
 
+  def AssertAtEndOfLine(self):
+    # type: () -> None
+    assert self.line_pos == len(self.src_line.content), \
+        '%d %s' % (self.line_pos, self.src_line.content)
+
   def LookPastSpace(self, lex_mode):
     # type: (lex_mode_t) -> Id_t
     """Look ahead in current line for non-space token, using given lexer mode.
@@ -148,8 +153,9 @@ def LookPastSpace(self, lex_mode):
 
       # NOTE: Instead of hard-coding this token, we could pass it in.
       # LookPastSpace(lex_mode, past_token_type)
-      # WS_Space only appears in the ShCommand state! 
-      if tok_type != Id.WS_Space:
+      # - WS_Space only given in lex_mode_e.ShCommand
+      # - Id.Ignored_Space given in lex_mode_e.Expr
+      if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
         break
       pos = end_pos
 
@@ -320,6 +326,35 @@ def PushHint(self, old_id, new_id):
     #log('   PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
     self.translation_stack.append((old_id, new_id))
 
+  def MoveToNextLine(self):
+    # type: () -> None
+    """For lookahead on the next line.
+
+    This is required by `ParseYshCase` and is used in `_NewlineOkForYshCase`.
+
+    We use this because otherwise calling `LookPastSpace` would return
+    `Id.Unknown_Tok` when the lexer has reached the end of the line. For an
+    example, take this case:
+
+      case (x) {
+               ^--- We are here
+
+        (else) {
+        ^--- We want lookahead to here
+
+            echo test
+        }
+      }
+
+    But, without `MoveToNextLine`, it is impossible to peek the '(' without
+    consuming it. And consuming it would be a problem once we want to hand off
+    pattern parsing to the expression parser.
+    """
+    self.line_lexer.AssertAtEndOfLine() # Only call this when you've seen \n
+
+    src_line, line_pos = self.line_reader.GetLine()
+    self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
+
   def _Read(self, lex_mode):
     # type: (lex_mode_t) -> Token
     """Read from the normal line buffer, not an alias."""

diff --git a/frontend/lexer_test.py b/frontend/lexer_test.py
@@ -5,7 +5,7 @@
 
 import unittest
 
-from _devbuild.gen.id_kind_asdl import Id
+from _devbuild.gen.id_kind_asdl import Id, Id_str
 from _devbuild.gen.types_asdl import lex_mode_e
 from core import test_lib
 from core.test_lib import Tok
@@ -32,6 +32,19 @@ def _PrintfOuterTokens(fmt):
   log('')
 
 
+def _PrintToken(t):
+  #print(t)
+  print('%20s %r' % (Id_str(t.id), t.tval))
+
+
+def _PrintAllTokens(lx, lex_mode):
+  while True:
+    t = lx.Read(lex_mode)
+    _PrintToken(t)
+    if t.id in (Id.Eof_Real, Id.Eol_Tok):
+      break
+
+
 class TokenTest(unittest.TestCase):
 
   def testToken(self):
@@ -57,6 +70,70 @@ def testPrintStats(self):
     print("Number of lex states: %d" % len(LEXER_DEF))
     print("Number of token dispatches: %d" % total)
 
+  def testMoveToNextLine(self):
+    """
+    Test that it doesn't mess up invariants
+    """
+    arena = test_lib.MakeArena('<lexer_test.py>')
+    code_str = '''cd {
+}'''
+
+    print('=== Printing all tokens')
+    if 1:
+      _, lx = test_lib.InitLexer(code_str, arena)
+      _PrintAllTokens(lx, lex_mode_e.ShCommand)
+
+    print()
+    print('=== MoveToNextLine() and LookAheadOne()')
+    _, lx = test_lib.InitLexer(code_str, arena)
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.Lit_Chars, t.id)
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.WS_Space, t.id)
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.Lit_LBrace, t.id)
+
+    try:
+      lx.MoveToNextLine()
+    except AssertionError:
+      pass
+    else:
+      self.fail('Should have asserted')
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.Op_Newline, t.id)
+
+    look_ahead_id = lx.LookAheadOne(lex_mode_e.ShCommand)
+    self.assertEqual(Id.Unknown_Tok, look_ahead_id)
+
+    # Method being tested
+    lx.MoveToNextLine()
+
+    # Lookahead
+    print('Lookahead')
+    look_ahead_id = lx.LookAheadOne(lex_mode_e.ShCommand)
+    self.assertEqual(Id.Lit_RBrace, look_ahead_id)
+
+    # Lookahead again
+    print('Lookahead 2')
+    look_ahead_id = lx.LookAheadOne(lex_mode_e.ShCommand)
+    self.assertEqual(Id.Lit_RBrace, look_ahead_id)
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.Lit_RBrace, t.id)
+
+    t = lx.Read(lex_mode_e.ShCommand)
+    _PrintToken(t)
+    self.assertEqual(Id.Eof_Real, t.id)
+
   def testMaybeUnreadOne(self):
     arena = test_lib.MakeArena('<lexer_test.py>')
     _, lx = test_lib.InitLexer('()', arena)

diff --git a/frontend/parse_lib.py b/frontend/parse_lib.py
@@ -4,7 +4,7 @@
 
 from _devbuild.gen.id_kind_asdl import Id, Id_t
 from _devbuild.gen.syntax_asdl import (
-    Token, CompoundWord, expr_t, word_t, Redir, ArgList, NameType, command
+    Token, CompoundWord, expr_t, word_t, Redir, ArgList, NameType, command, pat_t
 )
 from _devbuild.gen.types_asdl import lex_mode_e
 from _devbuild.gen import grammar_nt
@@ -392,6 +392,17 @@ def ParseYshForExpr(self, lexer, start_symbol):
 
     return lvalue, iterable, last_token
 
+  def ParseYshCasePattern(self, lexer):
+      # type: (Lexer) -> pat_t
+      """ (6) | (7), / dot* '.py' /, (else), etc """
+      e_parser = self._YshParser()
+      with ctx_PNodeAllocator(e_parser):
+          pnode, _last_token = e_parser.Parse(lexer, grammar_nt.case_pat)
+
+          pattern = self.tr.YshCasePattern(pnode)
+
+      return pattern
+
   def ParseProc(self, lexer, out):
     # type: (Lexer, command.Proc) -> Token
     """ proc f(x, y, @args) { """

diff --git a/frontend/syntax.asdl b/frontend/syntax.asdl
@@ -266,7 +266,7 @@ module syntax
   # For YSH typed case, left can be ( and /
   # And case_pat may contain more details
   CaseArm = (
-      Token left, List[word] pat_list, Token middle, List[command] action,
+      Token left, pat pattern, Token middle, List[command] action,
       Token? right
   )
 
@@ -277,6 +277,12 @@ module syntax
     Word(word w)
   | YshExpr(expr e)
 
+  pat =
+    Else
+  | Words(List[word] words)
+  | YshExprs(List[expr] exprs)
+  | Eggex(re eggex)
+
   # Each if arm starts with either an "if" or "elif" keyword
   # In YSH, the then keyword is not used (replaced by braces {})
   IfArm = (

diff --git a/osh/cmd_eval.py b/osh/cmd_eval.py
@@ -31,6 +31,7 @@
     proc_sig, proc_sig_e,
     redir_param, redir_param_e,
     for_iter, for_iter_e,
+    pat, pat_e,
 )
 from _devbuild.gen.runtime_asdl import (
     lvalue, lvalue_e,
@@ -1419,7 +1420,13 @@ def _Dispatch(self, node, cmd_st):
         done = False
 
         for case_arm in node.arms:
-          for pat_word in case_arm.pat_list:
+          if case_arm.pattern.tag() != pat_e.Words:
+            # TODO: support more than pat.Words
+            raise NotImplementedError()
+
+          pat_words = cast(pat.Words, case_arm.pattern)
+
+          for pat_word in pat_words.words:
             # NOTE: Is it OK that we're evaluating these as we go?
             # TODO: test it out in a loop
             pat_val = self.word_ev.EvalWordToString(pat_word,