Skip to content

Commit

Permalink
[j8] Start decoding strings in the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy C committed Dec 26, 2023
1 parent 9e3873e commit 8456b0a
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 16 deletions.
57 changes: 43 additions & 14 deletions data_lang/pyj8.py
Expand Up @@ -6,9 +6,7 @@
from mycpp import mylib
from mycpp.mylib import log

from typing import Tuple, List, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from frontend.match import SimpleMatchFunc
from typing import Tuple, List, Optional

_ = log

Expand Down Expand Up @@ -187,25 +185,56 @@ def __init__(self, s):
# type: (str) -> None
self.s = s
self.pos = 0
self.state = 0 # 0 for outer, 1 inside string
self.decoded = mylib.BufWriter()

def Next(self):
# type: () -> Tuple[Id_t, int, Optional[str]]
"""
Note: match_func will return Id.Eol_Tok repeatedly at the terminating NUL
"""
if self.state == 0:
tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)
else:
tok_id, end_pos = match.MatchJ8StrToken(self.s, self.pos)
tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

if tok_id not in (Id.J8_LeftQuote, Id.J8_LeftBQuote, Id.J8_LeftUQuote):
self.pos = end_pos
return tok_id, end_pos, None

str_pos = end_pos
while True:
tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)
if tok_id == Id.Eol_Tok:
raise AssertionError()
if tok_id == Id.Unknown_Tok:
# backslash etc.
raise AssertionError()

if tok_id == Id.Right_DoubleQuote:
self.pos = str_end

s = self.decoded.getvalue()
# TODO: clear() to reduce GC pressure
self.decoded = mylib.BufWriter()
return Id.J8_AnyString, str_end, s

if tok_id == Id.Char_OneChar: # JSON and J8
part = 'x'

if tok_id in (Id.J8_LeftQuote, Id.J8_LeftBQuote, Id.J8_LeftUQuote):
self.state = 1
elif tok_id == Id.Right_DoubleQuote:
self.state = 0
elif tok_id == Id.Char_Literals: # JSON and J8
part = self.s[str_pos:str_end]

elif tok_id == Id.Char_Unicode4: # JSON only
part = 'y'

elif tok_id == Id.Char_YHex: # J8 only
part = 'z'

elif tok_id == Id.Char_UBraced: # J8 only
part = 'u'

else:
raise AssertionError(Id_str(tok_id))

self.pos = end_pos
return tok_id, end_pos, None
self.decoded.write(part)
str_pos = str_end


def Decode(s, mode, buf):
Expand Down
3 changes: 2 additions & 1 deletion data_lang/pyj8_test.py
Expand Up @@ -18,7 +18,8 @@ def _PrintTokens(lex):
while True:
id_, end_pos, decoded = lex.Next()
val = lex.s[pos:end_pos]
log(' %20s %10r %r', Id_str(id_), val, decoded)
d = repr(decoded) if decoded is not None else '-'
log(' %20s %15r %15s', Id_str(id_), val, d)
if id_ == Id.Eol_Tok:
break
pos = end_pos
Expand Down
2 changes: 1 addition & 1 deletion frontend/lexer_def.py
Expand Up @@ -524,7 +524,7 @@ def R(pat, tok_type):
J8_DEF = [
C('"', Id.J8_LeftQuote),
# j"" makes sense in YSH code, u"" makes sense in JSON
C('j"', Id.J8_LeftUQuote),
C('u"', Id.J8_LeftUQuote),
C('b"', Id.J8_LeftBQuote),

C('[', Id.J8_LBracket),
Expand Down

0 comments on commit 8456b0a

Please sign in to comment.