Skip to content

Commit

Permalink
[refactor] Move split state machine to frontend/consts.py.
Browse files Browse the repository at this point in the history
We can't translate constant dicts at the moment.

Reorganize imports in osh/word_eval.py.
  • Loading branch information
Andy Chu committed Feb 14, 2020
1 parent 58f2372 commit ad1635d
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 112 deletions.
15 changes: 7 additions & 8 deletions build/dev.sh
@@ -1,14 +1,11 @@
#!/usr/bin/env bash
#
# Set up a development build of Oil on CPython.
# This is in constrast to the release build, which bundles Oil with "OVM" (a
# This is in contrast to the release build, which bundles Oil with "OVM" (a
# slight fork of CPython).

# Build Python extension modules. We use symlinks instead of installing them
# globally (or using virtualenv).
#
# Usage:
# ./pybuild.sh <function name>
# build/dev.sh <function name>

set -o nounset
set -o pipefail
Expand Down Expand Up @@ -102,14 +99,16 @@ gen-asdl-cpp() {
oil-asdl-to-py() {
OPTIONAL_FIELDS='' PRETTY_PRINT_METHODS='' gen-asdl-py 'asdl/hnode.asdl'

gen-asdl-py frontend/types.asdl # no dependency on Id
gen-asdl-py frontend/types.asdl
gen-asdl-py osh/runtime.asdl
gen-asdl-py 'tools/find/find.asdl'

build/codegen.sh const-mypy-gen # dependency on bool_arg_type_e
build/codegen.sh option-mypy-gen

# does __import__ of syntax_abbrev.py, which depends on Id. We could use the
# AST module later?
gen-asdl-py frontend/syntax.asdl 'frontend.syntax_abbrev'
gen-asdl-py osh/runtime.asdl
gen-asdl-py 'tools/find/find.asdl'
}

arith-parse-cpp-gen() {
Expand Down
93 changes: 93 additions & 0 deletions frontend/consts.py
Expand Up @@ -92,3 +92,96 @@ def RedirArgType(id_):
def RedirDefaultFd(id_):
# type: (Id_t) -> int
return REDIR_DEFAULT_FD[id_]


#
# Constants used by osh/split.py
#

# IFS splitting is complicated in general. We handle it with three concepts:
#
# - CH.* - Kinds of characters (edge labels)
# - ST.* - States (node labels)
# - EMIT.* - Actions (what to emit on a transition)
#
# The Split() loop in osh/split.py classifies characters, follows state
# transitions, and emits spans. A span is a (ignored Bool, end_index Int) pair.

# As an example, consider this string:
# 'a _ b'
#
# The character classes are:
#
#   a      ' '       _        ' '       b
#   Black  DE_White  DE_Gray  DE_White  Black
#
# The states are:
#
#   a      ' '        _        ' '        b
#   Black  DE_White1  DE_Gray  DE_White2  Black
#
# DE_White2 is whitespace that follows a "gray" non-whitespace IFS character.
#
# The spans emitted are:
#
# (part 'a', ignored ' _ ', part 'b')

# SplitForRead() will check if the last two spans are a \ and \\n. Easy.


# Shorter names for state machine enums
from _devbuild.gen.runtime_asdl import emit_e as EMIT
from _devbuild.gen.runtime_asdl import char_kind_e as CH
from _devbuild.gen.runtime_asdl import state_e as ST

# Edges of the IFS-splitting state machine.
#
# Key:   (current state ST.*, character kind CH.*)
# Value: (next state ST.*, action EMIT.*)
#
# The splitter looks this up as TRANSITIONS[state, ch] and unpacks the result
# into (new_state, action).  A next state of ST.Invalid marks an edge that is
# not expected to be taken.
#
# The trailing comment on each entry is an example input: the last character
# is the one being classified, and the characters before it show how the
# current state was reached.
TRANSITIONS = {
# Whitespace should have been stripped
(ST.Start, CH.DE_White): (ST.Invalid, EMIT.Nothing), # ' '
(ST.Start, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '_'
(ST.Start, CH.Black): (ST.Black, EMIT.Nothing), # 'a'
(ST.Start, CH.Backslash): (ST.Backslash, EMIT.Nothing), # '\'

(ST.DE_White1, CH.DE_White): (ST.DE_White1, EMIT.Nothing), # '  '
(ST.DE_White1, CH.DE_Gray): (ST.DE_Gray, EMIT.Nothing), # ' _'
(ST.DE_White1, CH.Black): (ST.Black, EMIT.Delim), # ' a'
(ST.DE_White1, CH.Backslash): (ST.Backslash, EMIT.Delim), # ' \'

(ST.DE_Gray, CH.DE_White): (ST.DE_White2, EMIT.Nothing), # '_ '
(ST.DE_Gray, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '__'
(ST.DE_Gray, CH.Black): (ST.Black, EMIT.Delim), # '_a'
# NOTE(review): every other CH.Backslash edge out of a non-Backslash state
# enters ST.Backslash, but this one enters ST.Black -- confirm this is
# intentional.
(ST.DE_Gray, CH.Backslash): (ST.Black, EMIT.Delim), # '_\'

(ST.DE_White2, CH.DE_White): (ST.DE_White2, EMIT.Nothing), # '_  '
(ST.DE_White2, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '_ _'
(ST.DE_White2, CH.Black): (ST.Black, EMIT.Delim), # '_ a'
(ST.DE_White2, CH.Backslash): (ST.Backslash, EMIT.Delim), # '_ \'

(ST.Black, CH.DE_White): (ST.DE_White1, EMIT.Part), # 'a '
(ST.Black, CH.DE_Gray): (ST.DE_Gray, EMIT.Part), # 'a_'
(ST.Black, CH.Black): (ST.Black, EMIT.Nothing), # 'aa'
(ST.Black, CH.Backslash): (ST.Backslash, EMIT.Part), # 'a\'

# Here we emit an ignored \ and the second character as well.
# We're emitting TWO spans here; we don't wait until the subsequent
# character. That is OK.
#
# Problem: if '\ ' is the last one, we don't want to emit a trailing span?
# In all other cases we do.

(ST.Backslash, CH.DE_White): (ST.Black, EMIT.Escape), # '\ '
(ST.Backslash, CH.DE_Gray): (ST.Black, EMIT.Escape), # '\_'
(ST.Backslash, CH.Black): (ST.Black, EMIT.Escape), # '\a'
# NOTE: second character is a backslash, but new state is ST.Black!
(ST.Backslash, CH.Backslash): (ST.Black, EMIT.Escape), # '\\'
}

# Action to take for the final span, keyed by the state the machine is in
# when the input is exhausted.  The splitter looks this up as
# LAST_SPAN_ACTION[state] after its character loop finishes.
#
# NOTE(review): there is no entry for ST.Start or ST.Invalid; presumably the
# caller never ends in those states (e.g. empty input is handled before the
# lookup) -- confirm against osh/split.py.
LAST_SPAN_ACTION = {
ST.Black: EMIT.Part,
ST.Backslash: EMIT.Escape,
# Ignore trailing IFS whitespace too. This is necessary for the case:
# IFS=':' ; read x y z <<< 'a : b : c :'.
ST.DE_White1: EMIT.Nothing,
ST.DE_Gray: EMIT.Delim,
ST.DE_White2: EMIT.Delim,
}

1 change: 0 additions & 1 deletion osh/builtin_bracket.py
Expand Up @@ -17,7 +17,6 @@
from asdl import runtime
from core import error
from core.util import p_die

from osh import sh_expr_eval
from osh import bool_parse
from osh import word_parse
Expand Down
2 changes: 1 addition & 1 deletion osh/runtime.asdl
Expand Up @@ -104,7 +104,7 @@ module runtime
Proc(int code)
| Pipeline(int* codes)

-- For word splitting in legacy.py
-- For word splitting (in frontend/consts.py and osh/split.py)
span = Black | Delim | Backslash
emit = Part | Delim | Empty | Escape | Nothing
state = Invalid | Start | DE_White1 | DE_Gray | DE_White2 | Black | Backslash
Expand Down
88 changes: 3 additions & 85 deletions osh/split.py
Expand Up @@ -35,6 +35,7 @@

from core import util
from core.util import log
from frontend import consts
from mycpp import mylib
from mycpp.mylib import tagswitch

Expand Down Expand Up @@ -230,89 +231,6 @@ def Split(self, s, allow_escape):
raise NotImplementedError()


# IFS splitting is complicated in general. We handle it with three concepts:
#
# - CH.* - Kinds of characters (edge labels)
# - ST.* - States (node labels)
# - EMIT.* Actions
#
# The Split() loop below classifies characters, follows state transitions, and
# emits spans. A span is a (ignored Bool, end_index Int) pair.

# As an example, consider this string:
# 'a _ b'
#
# The character classes are:
#
# a ' ' _ ' ' b
# Black DE_White DE_Gray DE_White Black
#
# The states are:
#
# a ' ' _ ' ' b
# Black DE_White1 DE_Gray DE_White2 Black
#
# DE_White2 is whitespace that follows a "gray" non-whitespace IFS character.
#
# The spans emitted are:
#
# (part 'a', ignored ' _ ', part 'b')

# SplitForRead() will check if the last two spans are a \ and \\n. Easy.


TRANSITIONS = {
# Whitespace should have been stripped
(ST.Start, CH.DE_White): (ST.Invalid, EMIT.Nothing), # ' '
(ST.Start, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '_'
(ST.Start, CH.Black): (ST.Black, EMIT.Nothing), # 'a'
(ST.Start, CH.Backslash): (ST.Backslash, EMIT.Nothing), # '\'

(ST.DE_White1, CH.DE_White): (ST.DE_White1, EMIT.Nothing), # ' '
(ST.DE_White1, CH.DE_Gray): (ST.DE_Gray, EMIT.Nothing), # ' _'
(ST.DE_White1, CH.Black): (ST.Black, EMIT.Delim), # ' a'
(ST.DE_White1, CH.Backslash): (ST.Backslash, EMIT.Delim), # ' \'

(ST.DE_Gray, CH.DE_White): (ST.DE_White2, EMIT.Nothing), # '_ '
(ST.DE_Gray, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '__'
(ST.DE_Gray, CH.Black): (ST.Black, EMIT.Delim), # '_a'
(ST.DE_Gray, CH.Backslash): (ST.Black, EMIT.Delim), # '_\'

(ST.DE_White2, CH.DE_White): (ST.DE_White2, EMIT.Nothing), # '_ '
(ST.DE_White2, CH.DE_Gray): (ST.DE_Gray, EMIT.Empty), # '_ _'
(ST.DE_White2, CH.Black): (ST.Black, EMIT.Delim), # '_ a'
(ST.DE_White2, CH.Backslash): (ST.Backslash, EMIT.Delim), # '_ \'

(ST.Black, CH.DE_White): (ST.DE_White1, EMIT.Part), # 'a '
(ST.Black, CH.DE_Gray): (ST.DE_Gray, EMIT.Part), # 'a_'
(ST.Black, CH.Black): (ST.Black, EMIT.Nothing), # 'aa'
(ST.Black, CH.Backslash): (ST.Backslash, EMIT.Part), # 'a\'

# Here we emit an ignored \ and the second character as well.
# We're emitting TWO spans here; we don't wait until the subsequent
# character. That is OK.
#
# Problem: if '\ ' is the last one, we don't want to emit a trailing span?
# In all other cases we do.

(ST.Backslash, CH.DE_White): (ST.Black, EMIT.Escape), # '\ '
(ST.Backslash, CH.DE_Gray): (ST.Black, EMIT.Escape), # '\_'
(ST.Backslash, CH.Black): (ST.Black, EMIT.Escape), # '\a'
# NOTE: second character is a backslash, but new state is ST.Black!
(ST.Backslash, CH.Backslash): (ST.Black, EMIT.Escape), # '\\'
}

LAST_SPAN_ACTION = {
ST.Black: EMIT.Part,
ST.Backslash: EMIT.Escape,
# Ignore trailing IFS whitespace too. This is necessary for the case:
# IFS=':' ; read x y z <<< 'a : b : c :'.
ST.DE_White1: EMIT.Nothing,
ST.DE_Gray: EMIT.Delim,
ST.DE_White2: EMIT.Delim,
}


class IfsSplitter(_BaseSplitter):
"""Split a string when IFS has non-whitespace characters."""

Expand Down Expand Up @@ -373,7 +291,7 @@ def Split(self, s, allow_escape):
else:
ch = CH.Black

new_state, action = TRANSITIONS[state, ch]
new_state, action = consts.TRANSITIONS[state, ch]
if new_state == ST.Invalid:
raise AssertionError(
'Invalid transition from %r with %r' % (state, ch))
Expand All @@ -399,7 +317,7 @@ def Split(self, s, allow_escape):
state = new_state
i += 1

last_action = LAST_SPAN_ACTION[state]
last_action = consts.LAST_SPAN_ACTION[state]
#log('n %d state %s last_action %s', n, state, last_action)

if last_action == EMIT.Part:
Expand Down
33 changes: 16 additions & 17 deletions osh/word_eval.py
Expand Up @@ -6,42 +6,41 @@
from _devbuild.gen.syntax_asdl import (
braced_var_sub, Token,
word, word_e, word_t, compound_word,
bracket_op_e, suffix_op_e, word_part_e,
bracket_op__ArrayIndex, bracket_op__WholeArray,
suffix_op__Nullary, suffix_op__PatSub, suffix_op__Slice,
suffix_op__Unary, sh_array_literal,
single_quoted, double_quoted, simple_var_sub, command_sub,
word_part__ArithSub, word_part__EscapedLiteral,
word_part__AssocArrayLiteral, word_part__ExprSub,
word_part__ExtGlob, word_part__FuncCall,
word_part__Splice, word_part__TildeSub,
bracket_op_e, bracket_op__ArrayIndex, bracket_op__WholeArray,
suffix_op_e, suffix_op__Nullary, suffix_op__PatSub, suffix_op__Slice,
suffix_op__Unary,
sh_array_literal, single_quoted, double_quoted, simple_var_sub,
command_sub,
word_part_e, word_part__ArithSub, word_part__EscapedLiteral,
word_part__AssocArrayLiteral, word_part__ExprSub, word_part__ExtGlob,
word_part__FuncCall, word_part__Splice, word_part__TildeSub,
)
from _devbuild.gen.runtime_asdl import (
builtin_e, effect_e,
part_value, part_value_e, part_value_t, part_value__String, part_value__Array,
value, value_e, value_t, lvalue,
assign_arg,
part_value, part_value_e, part_value_t, part_value__String,
part_value__Array,
value, value_e, value_t, value__Str, value__AssocArray,
value__MaybeStrArray, value__Obj,
lvalue, assign_arg,
cmd_value_e, cmd_value_t, cmd_value, cmd_value__Assign, cmd_value__Argv,
value__Str, value__AssocArray, value__MaybeStrArray, value__Obj,
quote_e, quote_t,
)
from core import builtin_def
from core import error
from core import passwd
from core import process
from core import state
from core.util import log, e_die, e_strict
from frontend import consts
from frontend import match
from mycpp.mylib import tagswitch
from mycpp import mylib
from osh import braces
from osh import glob_
from osh import string_ops
from core import state
from osh import word_
from osh import word_compile

from mycpp.mylib import tagswitch
from mycpp import mylib

import posix_ as posix

from typing import Optional, Tuple, List, Dict, cast, TYPE_CHECKING
Expand Down

0 comments on commit ad1635d

Please sign in to comment.