Skip to content

Commit

Permalink
[j8] Start lexer for J8 Lines.
Browse files Browse the repository at this point in the history
It's different from JSON8 / NIL8.

I realized we need to make ASCII whitespace rules more consistent across
Oils.  mycpp s.strip() uses libc isspace(), while regexes use [ \t\r\n]
and similar.
  • Loading branch information
Andy Chu committed Apr 18, 2024
1 parent 130e8bc commit d60c00e
Show file tree
Hide file tree
Showing 10 changed files with 164 additions and 6 deletions.
8 changes: 8 additions & 0 deletions cpp/frontend_match.cc
Expand Up @@ -95,6 +95,14 @@ Tuple2<Id_t, int> MatchJ8Token(BigStr* s, int pos) {
return Tuple2<Id_t, int>(static_cast<Id_t>(id), end_pos);
}

// Wrapper exposing the generated J8 Lines lexer (the global
// ::MatchJ8LinesToken from the re2c output) to mycpp-translated code.
// Returns (token id, end position) for the token starting at 'pos'.
Tuple2<Id_t, int> MatchJ8LinesToken(BigStr* s, int pos) {
  const unsigned char* buf = reinterpret_cast<const unsigned char*>(s->data_);
  int tok_id = 0;
  int tok_end = 0;
  ::MatchJ8LinesToken(buf, len(s), pos, &tok_id, &tok_end);
  return Tuple2<Id_t, int>(static_cast<Id_t>(tok_id), tok_end);
}

Tuple2<Id_t, int> MatchJ8StrToken(BigStr* s, int pos) {
int id;
int end_pos;
Expand Down
1 change: 1 addition & 0 deletions cpp/frontend_match.h
Expand Up @@ -67,6 +67,7 @@ Id_t BracketBinary(BigStr* s);
Id_t BracketOther(BigStr* s);

Tuple2<Id_t, int> MatchJ8Token(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJ8LinesToken(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJ8StrToken(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJsonStrToken(BigStr* s, int pos);

Expand Down
91 changes: 89 additions & 2 deletions data_lang/j8.py
Expand Up @@ -576,7 +576,19 @@ def Next(self):
"Comments aren't part of JSON; you may want 'json8 read'",
end_pos)

# Non-string tokens like { } null etc.
if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
Id.Left_USingleQuote):
return self._DecodeString(tok_id, end_pos)

self.pos = end_pos
return tok_id, end_pos, None

def J8LinesNext(self):
# type: () -> Tuple[Id_t, int, Optional[str]]
""" Like Next(), but for J8 Lines """

tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
Id.Left_USingleQuote):
return self._DecodeString(tok_id, end_pos)
Expand Down Expand Up @@ -1050,11 +1062,81 @@ def ParseNil8(self):
return obj


class J8LinesParser(_Parser):
    """Parser for the J8 Lines format (work in progress).

    Line-based grammar:

        j8_line = Space? string Space? (Op_Newline | Eof_Real)
        string  = J8_String
                | (Lit_Chars (Space (Lit_Chars | Left-*))*)

    where Lit_Chars is valid UTF-8.

    Note that "" and u'' here are not a decoded string, because the line
    started with literals:

        foo "" u''

    Using the grammar style preserves location info and reduces allocations.

    TODO:
    - We need something like LexerDecoder, but with a J8_LINES mode.
    - _Parser should take the outer lexer.
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def ParseLine(self):
        # type: () -> Optional[str]
        # TODO: parse a single j8_line; unimplemented stub for now.
        pass

    def _Parse(self):
        # type: () -> List[str]
        """Consume the tokens of one line.

        WIP: only skips surrounding whitespace and logs the current token;
        no strings are decoded yet.
        """
        log('tok_id %s', Id_str(self.tok_id))

        if self.tok_id == Id.WS_Space:  # optional leading whitespace
            self._Next()

        if self.tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                           Id.Left_USingleQuote):
            # TODO: decode a quoted J8 string here, e.g.
            #tok_id, end_pos, s = self._DecodeString(tok_id, end_pos)
            #log('s %r', s)
            pass

        if self.tok_id == Id.WS_Space:  # optional trailing whitespace
            self._Next()

        # Fix: previously returned None, which contradicts the declared
        # List[str] return type and made Parse() (and thus SplitJ8Lines())
        # return None instead of a list.
        return []

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log a label plus the current token id and extent."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def Parse(self):
        # type: () -> List[str]
        """Parse the input into a list of lines.  Raises error.Decode."""
        self._Show('1')

        # TODO: J8LinesNext()
        self._Next()
        self._Show('2')
        lines = self._Parse()

        if self.tok_id != Id.Eol_Tok:
            raise self._Error('Unexpected trailing input')
        return lines


# Types of lines, used by the fallback splitter: classifies each line by its
# leading quote character (or absence of one).
UNQUOTED = 0  # e.g.  foo bar
SINGLE = 1  # e.g.  'foo'  u'foo'  b'foo'
DOUBLE = 2  # e.g.  "foo"


def SplitJ8Lines(s):
# type: (str) -> List[str]
"""Used by @(echo split command sub)
Expand All @@ -1079,6 +1161,10 @@ def SplitJ8Lines(s):
Or we can also just reuse the same LexerDecoder for each line? Reset()?
Then we get the line number for free.
"""
p = J8LinesParser(s)
# raises error.Decode
return p.Parse()

lines = s.split('\n')
strs = [] # type: List[str]
for line in lines:
Expand All @@ -1090,7 +1176,8 @@ def SplitJ8Lines(s):
continue

left_type = UNQUOTED
if (line.startswith("u'") or line.startswith("b'") or line.startswith("'")):
if (line.startswith("u'") or line.startswith("b'") or
line.startswith("'")):
left_type = SINGLE
elif line.startswith('"'):
left_type = DOUBLE
Expand Down
13 changes: 12 additions & 1 deletion frontend/lexer_def.py
Expand Up @@ -549,12 +549,15 @@ def R(pat, tok_type):
r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
# yapf: enable

J8_DEF = [
# Left-quote tokens shared by the J8 value lexer (J8_DEF) and the
# J8 Lines lexer (J8_LINES_DEF).
_J8_LEFT = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    # Three left quotes that are J8 only
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
]

J8_DEF = _J8_LEFT + [
C('[', Id.J8_LBracket),
C(']', Id.J8_RBracket),
C('{', Id.J8_LBrace),
Expand Down Expand Up @@ -592,6 +595,14 @@ def R(pat, tok_type):
R(r'[^\0]', Id.Unknown_Tok),
]

# Lexer for the J8 Lines format: left quotes, intra-line whitespace,
# newlines, and runs of literal characters.
J8_LINES_DEF = _J8_LEFT + [
    R(r'[ \r\t]+', Id.WS_Space),
    R(r'[\n]', Id.Op_Newline),

    # A run of literal chars: not ASCII whitespace, ' or " or NUL (EOF).
    # Fix: \n was missing from the exclusion, so with longest-match lexing a
    # Lit_Chars token could swallow the newline and run across line
    # boundaries, shadowing the Op_Newline rule above.
    R(r'''[^ \t\r\n'"\0]+''', Id.Lit_Chars),
]

# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
# But \n has to be allowed in multi-line strings
_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
Expand Down
1 change: 1 addition & 0 deletions frontend/lexer_gen.py
Expand Up @@ -436,6 +436,7 @@ def main(argv):
TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
TranslateSimpleLexer('MatchJ8Token', lexer_def.J8_DEF)
TranslateSimpleLexer('MatchJ8LinesToken', lexer_def.J8_LINES_DEF)
TranslateSimpleLexer('MatchJ8StrToken', lexer_def.J8_STR_DEF)
TranslateSimpleLexer('MatchJsonStrToken', lexer_def.JSON_STR_DEF)

Expand Down
8 changes: 8 additions & 0 deletions frontend/match.py
Expand Up @@ -131,6 +131,12 @@ def _MatchJ8Token_Fast(line, start_pos):
return tok_type, end_pos


def _MatchJ8LinesToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one J8 Lines token via the native fastlex extension."""
    return fastlex.MatchJ8LinesToken(line, start_pos)


def _MatchJ8StrToken_Fast(line, start_pos):
# type: (str, int) -> Tuple[Id_t, int]
tok_type, end_pos = fastlex.MatchJ8StrToken(line, start_pos)
Expand All @@ -152,6 +158,7 @@ def _MatchJsonStrToken_Fast(line, start_pos):
BRACE_RANGE_MATCHER = _MatchBraceRangeToken_Fast

MatchJ8Token = _MatchJ8Token_Fast
MatchJ8LinesToken = _MatchJ8LinesToken_Fast
MatchJ8StrToken = _MatchJ8StrToken_Fast
MatchJsonStrToken = _MatchJsonStrToken_Fast

Expand All @@ -168,6 +175,7 @@ def _MatchJsonStrToken_Fast(line, start_pos):
BRACE_RANGE_MATCHER = _MatchTokenSlow(lexer_def.BRACE_RANGE_DEF)

MatchJ8Token = _MatchTokenSlow(lexer_def.J8_DEF)
MatchJ8LinesToken = _MatchTokenSlow(lexer_def.J8_LINES_DEF)
MatchJ8StrToken = _MatchTokenSlow(lexer_def.J8_STR_DEF)
MatchJsonStrToken = _MatchTokenSlow(lexer_def.JSON_STR_DEF)

Expand Down
15 changes: 15 additions & 0 deletions frontend/match_test.py
Expand Up @@ -59,6 +59,21 @@ def testJ8Lexer(self):
lex = match.SimpleLexer(match.MatchJ8Token, s)
_PrintTokens(lex)

def testJ8LinesLexer(self):
    # type: () -> None
    """Smoke test: print the tokens MatchJ8LinesToken produces.

    Cases cover a quoted line, lines with unterminated u'/b'/' quotes,
    and an unquoted multi-word line with surrounding spaces.
    """
    cases = [
        ' "hello"',
        " u'hi",
        " b'hi",
        " 'hi",
        ' multiple words ',
    ]

    for s in cases:
        log('---')
        log('J8 LINES CASE %r', s)
        lex = match.SimpleLexer(match.MatchJ8LinesToken, s)
        _PrintTokens(lex)

def testJ8StrLexer(self):
cases = [
'"hi"',
Expand Down
27 changes: 27 additions & 0 deletions pyext/fastlex.c
Expand Up @@ -195,6 +195,31 @@ fastlex_MatchJ8Token(PyObject *self, PyObject *args) {
return Py_BuildValue("(ii)", id, end_pos);
}

// Python binding for the generated J8 Lines lexer.
// Args: (line, start_pos); returns a (id, end_pos) tuple.
// Raises ValueError when start_pos is past the end of the buffer.
static PyObject *
fastlex_MatchJ8LinesToken(PyObject *self, PyObject *args) {
  unsigned char* line;
  int line_len;

  int start_pos;
  if (!PyArg_ParseTuple(args, "s#i", &line, &line_len, &start_pos)) {
    return NULL;
  }

  // Bounds checking.  start_pos == line_len is allowed so the lexer can
  // match end-of-input at the NUL terminator.
  // NOTE(review): negative start_pos is not rejected here — presumably
  // callers never pass one; confirm against the other Match*Token wrappers.
  if (start_pos > line_len) {
    PyErr_Format(PyExc_ValueError,
        "Invalid MatchJ8LinesToken call (start_pos = %d, line_len = %d)",
        start_pos, line_len);
    return NULL;
  }

  int id;
  int end_pos;
  MatchJ8LinesToken(line, line_len, start_pos, &id, &end_pos);
  return Py_BuildValue("(ii)", id, end_pos);
}


static PyObject *
fastlex_MatchJ8StrToken(PyObject *self, PyObject *args) {
unsigned char* line;
Expand Down Expand Up @@ -305,6 +330,8 @@ static PyMethodDef methods[] = {
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8Token", fastlex_MatchJ8Token, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8LinesToken", fastlex_MatchJ8LinesToken, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8StrToken", fastlex_MatchJ8StrToken, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJsonStrToken", fastlex_MatchJsonStrToken, METH_VARARGS,
Expand Down
1 change: 1 addition & 0 deletions pyext/fastlex.pyi
Expand Up @@ -12,6 +12,7 @@ def MatchHistoryToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchGlobToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchBraceRangeToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8Token(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8LinesToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8StrToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJsonStrToken(line: str, start_pos: int) -> Tuple[int, int]: ...

Expand Down
5 changes: 2 additions & 3 deletions ysh/expr_to_ast.py
Expand Up @@ -805,9 +805,8 @@ def _CheckLhs(self, lhs):

else:
# Illegal - e.g. setglobal {}["key"] = 42
p_die(
"Subscript/Attribute not allowed on this LHS expression",
location.TokenForExpr(lhs))
p_die("Subscript/Attribute not allowed on this LHS expression",
location.TokenForExpr(lhs))

def _LhsExprList(self, p_node):
# type: (PNode) -> List[y_lhs_t]
Expand Down

0 comments on commit d60c00e

Please sign in to comment.