Skip to content

Commit

Permalink
[j8] Start lexer for J8 Lines.
Browse files Browse the repository at this point in the history
It's different from JSON8 / NIL8.

I realized we need to make ASCII whitespace rules more consistent across
Oils.  mycpp s.strip() uses libc isspace(), while regexes use [ \t\r\n]
and similar.
  • Loading branch information
Andy Chu committed Apr 18, 2024
1 parent 130e8bc commit d60c00e
Show file tree
Hide file tree
Showing 10 changed files with 164 additions and 6 deletions.
8 changes: 8 additions & 0 deletions cpp/frontend_match.cc
Expand Up @@ -95,6 +95,14 @@ Tuple2<Id_t, int> MatchJ8Token(BigStr* s, int pos) {
return Tuple2<Id_t, int>(static_cast<Id_t>(id), end_pos);
}

// Wrapper exposing the generated J8 Lines lexer (the global
// ::MatchJ8LinesToken from the re2c output) to mycpp-translated code.
// Returns (token id, end position) for the token starting at 'pos'.
Tuple2<Id_t, int> MatchJ8LinesToken(BigStr* s, int pos) {
  const unsigned char* buf = reinterpret_cast<const unsigned char*>(s->data_);
  int tok_id = 0;
  int tok_end = 0;
  ::MatchJ8LinesToken(buf, len(s), pos, &tok_id, &tok_end);
  return Tuple2<Id_t, int>(static_cast<Id_t>(tok_id), tok_end);
}

Tuple2<Id_t, int> MatchJ8StrToken(BigStr* s, int pos) {
int id;
int end_pos;
Expand Down
1 change: 1 addition & 0 deletions cpp/frontend_match.h
Expand Up @@ -67,6 +67,7 @@ Id_t BracketBinary(BigStr* s);
Id_t BracketOther(BigStr* s);

Tuple2<Id_t, int> MatchJ8Token(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJ8LinesToken(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJ8StrToken(BigStr* s, int pos);
Tuple2<Id_t, int> MatchJsonStrToken(BigStr* s, int pos);

Expand Down
91 changes: 89 additions & 2 deletions data_lang/j8.py
Expand Up @@ -576,7 +576,19 @@ def Next(self):
"Comments aren't part of JSON; you may want 'json8 read'",
end_pos)

# Non-string tokens like { } null etc.
if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
Id.Left_USingleQuote):
return self._DecodeString(tok_id, end_pos)

self.pos = end_pos
return tok_id, end_pos, None

def J8LinesNext(self):
# type: () -> Tuple[Id_t, int, Optional[str]]
""" Like Next(), but for J8 Lines """

tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
Id.Left_USingleQuote):
return self._DecodeString(tok_id, end_pos)
Expand Down Expand Up @@ -1050,11 +1062,81 @@ def ParseNil8(self):
return obj


class J8LinesParser(_Parser):
    """Parser for the J8 Lines format (work in progress).

    Line-based grammar:

        j8_line = Space? string Space? (Op_Newline | Eof_Real)
        string  = J8_String
                | (Lit_Chars (Space (Lit_Chars | Left-*))*)

    where Lit_Chars is valid UTF-8.

    Note that "" and u'' here are not a decoded string, because the line
    started with literals:

        foo "" u''

    Using the grammar style preserves location info and reduces allocations.

    TODO:
    - We need something like LexerDecoder, but with a J8_LINES mode.
    - _Parser should take the outer lexer.
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def ParseLine(self):
        # type: () -> Optional[str]
        # TODO: parse a single j8_line; unimplemented stub for now.
        pass

    def _Parse(self):
        # type: () -> List[str]
        """Consume the tokens of one line.

        WIP: only skips surrounding whitespace and logs the current token;
        no strings are decoded yet.
        """
        log('tok_id %s', Id_str(self.tok_id))

        if self.tok_id == Id.WS_Space:  # optional leading whitespace
            self._Next()

        if self.tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                           Id.Left_USingleQuote):
            # TODO: decode a quoted J8 string here, e.g.
            #tok_id, end_pos, s = self._DecodeString(tok_id, end_pos)
            #log('s %r', s)
            pass

        if self.tok_id == Id.WS_Space:  # optional trailing whitespace
            self._Next()

        # Fix: previously returned None, which contradicts the declared
        # List[str] return type and made Parse() (and thus SplitJ8Lines())
        # return None instead of a list.
        return []

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log a label plus the current token id and extent."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def Parse(self):
        # type: () -> List[str]
        """Parse the input into a list of lines.  Raises error.Decode."""
        self._Show('1')

        # TODO: J8LinesNext()
        self._Next()
        self._Show('2')
        lines = self._Parse()

        if self.tok_id != Id.Eol_Tok:
            raise self._Error('Unexpected trailing input')
        return lines


# Types of lines, used by the fallback splitter: classifies each line by its
# leading quote character (or absence of one).
UNQUOTED = 0  # e.g.  foo bar
SINGLE = 1  # e.g.  'foo'  u'foo'  b'foo'
DOUBLE = 2  # e.g.  "foo"


def SplitJ8Lines(s):
# type: (str) -> List[str]
"""Used by @(echo split command sub)
Expand All @@ -1079,6 +1161,10 @@ def SplitJ8Lines(s):
Or we can also just reuse the same LexerDecoder for each line? Reset()?
Then we get the line number for free.
"""
p = J8LinesParser(s)
# raises error.Decode
return p.Parse()

lines = s.split('\n')
strs = [] # type: List[str]
for line in lines:
Expand All @@ -1090,7 +1176,8 @@ def SplitJ8Lines(s):
continue

left_type = UNQUOTED
if (line.startswith("u'") or line.startswith("b'") or line.startswith("'")):
if (line.startswith("u'") or line.startswith("b'") or
line.startswith("'")):
left_type = SINGLE
elif line.startswith('"'):
left_type = DOUBLE
Expand Down
13 changes: 12 additions & 1 deletion frontend/lexer_def.py
Expand Up @@ -549,12 +549,15 @@ def R(pat, tok_type):
r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
# yapf: enable

J8_DEF = [
# Left-quote tokens shared by the J8 value lexer (J8_DEF) and the
# J8 Lines lexer (J8_LINES_DEF).
_J8_LEFT = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    # Three left quotes that are J8 only
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
]

J8_DEF = _J8_LEFT + [
C('[', Id.J8_LBracket),
C(']', Id.J8_RBracket),
C('{', Id.J8_LBrace),
Expand Down Expand Up @@ -592,6 +595,14 @@ def R(pat, tok_type):
R(r'[^\0]', Id.Unknown_Tok),
]

# Lexer for the J8 Lines format: left quotes, intra-line whitespace,
# newlines, and runs of literal characters.
J8_LINES_DEF = _J8_LEFT + [
    R(r'[ \r\t]+', Id.WS_Space),
    R(r'[\n]', Id.Op_Newline),

    # A run of literal chars: not ASCII whitespace, ' or " or NUL (EOF).
    # Fix: \n was missing from the exclusion, so with longest-match lexing a
    # Lit_Chars token could swallow the newline and run across line
    # boundaries, shadowing the Op_Newline rule above.
    R(r'''[^ \t\r\n'"\0]+''', Id.Lit_Chars),
]

# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
# But \n has to be allowed in multi-line strings
_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
Expand Down
1 change: 1 addition & 0 deletions frontend/lexer_gen.py
Expand Up @@ -436,6 +436,7 @@ def main(argv):
TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
TranslateSimpleLexer('MatchJ8Token', lexer_def.J8_DEF)
TranslateSimpleLexer('MatchJ8LinesToken', lexer_def.J8_LINES_DEF)
TranslateSimpleLexer('MatchJ8StrToken', lexer_def.J8_STR_DEF)
TranslateSimpleLexer('MatchJsonStrToken', lexer_def.JSON_STR_DEF)

Expand Down
8 changes: 8 additions & 0 deletions frontend/match.py
Expand Up @@ -131,6 +131,12 @@ def _MatchJ8Token_Fast(line, start_pos):
return tok_type, end_pos


def _MatchJ8LinesToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one J8 Lines token via the native fastlex extension."""
    return fastlex.MatchJ8LinesToken(line, start_pos)


def _MatchJ8StrToken_Fast(line, start_pos):
# type: (str, int) -> Tuple[Id_t, int]
tok_type, end_pos = fastlex.MatchJ8StrToken(line, start_pos)
Expand All @@ -152,6 +158,7 @@ def _MatchJsonStrToken_Fast(line, start_pos):
BRACE_RANGE_MATCHER = _MatchBraceRangeToken_Fast

MatchJ8Token = _MatchJ8Token_Fast
MatchJ8LinesToken = _MatchJ8LinesToken_Fast
MatchJ8StrToken = _MatchJ8StrToken_Fast
MatchJsonStrToken = _MatchJsonStrToken_Fast

Expand All @@ -168,6 +175,7 @@ def _MatchJsonStrToken_Fast(line, start_pos):
BRACE_RANGE_MATCHER = _MatchTokenSlow(lexer_def.BRACE_RANGE_DEF)

MatchJ8Token = _MatchTokenSlow(lexer_def.J8_DEF)
MatchJ8LinesToken = _MatchTokenSlow(lexer_def.J8_LINES_DEF)
MatchJ8StrToken = _MatchTokenSlow(lexer_def.J8_STR_DEF)
MatchJsonStrToken = _MatchTokenSlow(lexer_def.JSON_STR_DEF)

Expand Down
15 changes: 15 additions & 0 deletions frontend/match_test.py
Expand Up @@ -59,6 +59,21 @@ def testJ8Lexer(self):
lex = match.SimpleLexer(match.MatchJ8Token, s)
_PrintTokens(lex)

def testJ8LinesLexer(self):
    # type: () -> None
    """Smoke test: print the tokens MatchJ8LinesToken produces.

    Cases cover a quoted line, lines with unterminated u'/b'/' quotes,
    and an unquoted multi-word line with surrounding spaces.
    """
    cases = [
        ' "hello"',
        " u'hi",
        " b'hi",
        " 'hi",
        ' multiple words ',
    ]

    for s in cases:
        log('---')
        log('J8 LINES CASE %r', s)
        lex = match.SimpleLexer(match.MatchJ8LinesToken, s)
        _PrintTokens(lex)

def testJ8StrLexer(self):
cases = [
'"hi"',
Expand Down
27 changes: 27 additions & 0 deletions pyext/fastlex.c
Expand Up @@ -195,6 +195,31 @@ fastlex_MatchJ8Token(PyObject *self, PyObject *args) {
return Py_BuildValue("(ii)", id, end_pos);
}

// Python binding for the generated J8 Lines lexer.
// Args: (line, start_pos); returns a (id, end_pos) tuple.
// Raises ValueError when start_pos is past the end of the buffer.
static PyObject *
fastlex_MatchJ8LinesToken(PyObject *self, PyObject *args) {
  unsigned char* line;
  int line_len;

  int start_pos;
  if (!PyArg_ParseTuple(args, "s#i", &line, &line_len, &start_pos)) {
    return NULL;
  }

  // Bounds checking.  start_pos == line_len is allowed so the lexer can
  // match end-of-input at the NUL terminator.
  // NOTE(review): negative start_pos is not rejected here — presumably
  // callers never pass one; confirm against the other Match*Token wrappers.
  if (start_pos > line_len) {
    PyErr_Format(PyExc_ValueError,
        "Invalid MatchJ8LinesToken call (start_pos = %d, line_len = %d)",
        start_pos, line_len);
    return NULL;
  }

  int id;
  int end_pos;
  MatchJ8LinesToken(line, line_len, start_pos, &id, &end_pos);
  return Py_BuildValue("(ii)", id, end_pos);
}


static PyObject *
fastlex_MatchJ8StrToken(PyObject *self, PyObject *args) {
unsigned char* line;
Expand Down Expand Up @@ -305,6 +330,8 @@ static PyMethodDef methods[] = {
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8Token", fastlex_MatchJ8Token, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8LinesToken", fastlex_MatchJ8LinesToken, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJ8StrToken", fastlex_MatchJ8StrToken, METH_VARARGS,
"(line, start_pos) -> (id, end_pos)."},
{"MatchJsonStrToken", fastlex_MatchJsonStrToken, METH_VARARGS,
Expand Down
1 change: 1 addition & 0 deletions pyext/fastlex.pyi
Expand Up @@ -12,6 +12,7 @@ def MatchHistoryToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchGlobToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchBraceRangeToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8Token(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8LinesToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJ8StrToken(line: str, start_pos: int) -> Tuple[int, int]: ...
def MatchJsonStrToken(line: str, start_pos: int) -> Tuple[int, int]: ...

Expand Down
5 changes: 2 additions & 3 deletions ysh/expr_to_ast.py
Expand Up @@ -805,9 +805,8 @@ def _CheckLhs(self, lhs):

else:
# Illegal - e.g. setglobal {}["key"] = 42
p_die(
"Subscript/Attribute not allowed on this LHS expression",
location.TokenForExpr(lhs))
p_die("Subscript/Attribute not allowed on this LHS expression",
location.TokenForExpr(lhs))

def _LhsExprList(self, p_node):
# type: (PNode) -> List[y_lhs_t]
Expand Down

0 comments on commit d60c00e

Please sign in to comment.