[refactor] Move split state machine to frontend/consts.py.

We can't translate constant dicts at the moment. Reorganize imports in osh/word_eval.py.
oilshell · Feb 14, 2020 · ad1635d · ad1635d
1 parent 58f2372
commit ad1635d
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 112 deletions.
diff --git a/build/dev.sh b/build/dev.sh
@@ -1,14 +1,11 @@
 #!/usr/bin/env bash
 #
 # Set up a development build of Oil on CPython.
-# This is in constrast to the release build, which bundles Oil with "OVM" (a
+# This is in contrast to the release build, which bundles Oil with "OVM" (a
 # slight fork of CPython).
-
-# Build Python extension modules.  We use symlinks instead of installing them
-# globally (or using virtualenv).
 #
 # Usage:
-#   ./pybuild.sh <function name>
+#   build/dev.sh <function name>
 
 set -o nounset
 set -o pipefail
@@ -102,14 +99,16 @@ gen-asdl-cpp() {
 oil-asdl-to-py() {
   OPTIONAL_FIELDS='' PRETTY_PRINT_METHODS='' gen-asdl-py 'asdl/hnode.asdl'
 
-  gen-asdl-py frontend/types.asdl  # no dependency on Id
+  gen-asdl-py frontend/types.asdl
+  gen-asdl-py osh/runtime.asdl
+  gen-asdl-py 'tools/find/find.asdl'
 
   build/codegen.sh const-mypy-gen  # dependency on bool_arg_type_e
   build/codegen.sh option-mypy-gen
 
+  # This step does an __import__ of syntax_abbrev.py, which depends on Id.  We
+  # could use the AST module later?
   gen-asdl-py frontend/syntax.asdl 'frontend.syntax_abbrev'
-  gen-asdl-py osh/runtime.asdl
-  gen-asdl-py 'tools/find/find.asdl'
 }
 
 arith-parse-cpp-gen() {

diff --git a/frontend/consts.py b/frontend/consts.py
@@ -92,3 +92,96 @@ def RedirArgType(id_):
 def RedirDefaultFd(id_):
   # type: (Id_t) -> int
   return REDIR_DEFAULT_FD[id_]
+
+
+#
+# Constants used by osh/split.py
+#
+
+# IFS splitting is complicated in general.  We handle it with three concepts:
+#
+# - CH.* - Kinds of characters (edge labels)
+# - ST.* - States (node labels)
+# - EMIT.* - Actions (what to emit on a transition)
+#
+# Split() in osh/split.py classifies characters, follows state transitions, and
+# emits spans.  A span is a (ignored Bool, end_index Int) pair.
+
+# As an example, consider this string:
+# 'a _ b'
+#
+# The character classes are:
+#
+# a      ' '        _        ' '        b
+# Black  DE_White   DE_Gray  DE_White   Black
+#
+# The states are:
+#
+# a      ' '        _        ' '        b
+# Black  DE_White1  DE_Gray  DE_White2  Black
+#
+# DE_White2 is whitespace that follows a "gray" non-whitespace IFS character.
+#
+# The spans emitted are:
+#
+# (part 'a', ignored ' _ ', part 'b')
+
+# SplitForRead() will check if the last two spans are a \ and \\n.  Easy.
+
+
+# Shorter names for state machine enums
+from _devbuild.gen.runtime_asdl import emit_e as EMIT
+from _devbuild.gen.runtime_asdl import char_kind_e as CH
+from _devbuild.gen.runtime_asdl import state_e as ST
+
+TRANSITIONS = {
+    # Whitespace should have been stripped
+    (ST.Start, CH.DE_White):  (ST.Invalid,   EMIT.Nothing),      # ' '
+    (ST.Start, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty),        # '_'
+    (ST.Start, CH.Black):     (ST.Black,     EMIT.Nothing),      # 'a'
+    (ST.Start, CH.Backslash): (ST.Backslash, EMIT.Nothing),      # '\'
+
+    (ST.DE_White1, CH.DE_White):  (ST.DE_White1, EMIT.Nothing),  # '  '
+    (ST.DE_White1, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Nothing),  # ' _'
+    (ST.DE_White1, CH.Black):     (ST.Black,     EMIT.Delim),    # ' a'
+    (ST.DE_White1, CH.Backslash): (ST.Backslash, EMIT.Delim),    # ' \'
+
+    (ST.DE_Gray, CH.DE_White):  (ST.DE_White2, EMIT.Nothing),    # '_ '
+    (ST.DE_Gray, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty),      # '__'
+    (ST.DE_Gray, CH.Black):     (ST.Black,     EMIT.Delim),      # '_a'
+    (ST.DE_Gray, CH.Backslash): (ST.Black,     EMIT.Delim),      # '_\'
+
+    (ST.DE_White2, CH.DE_White):  (ST.DE_White2, EMIT.Nothing),  # '_  '
+    (ST.DE_White2, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty),    # '_ _'
+    (ST.DE_White2, CH.Black):     (ST.Black,     EMIT.Delim),    # '_ a'
+    (ST.DE_White2, CH.Backslash): (ST.Backslash, EMIT.Delim),    # '_ \'
+
+    (ST.Black, CH.DE_White):  (ST.DE_White1, EMIT.Part),         # 'a '
+    (ST.Black, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Part),         # 'a_'
+    (ST.Black, CH.Black):     (ST.Black,     EMIT.Nothing),      # 'aa'
+    (ST.Black, CH.Backslash): (ST.Backslash, EMIT.Part),         # 'a\'
+
+    # Here we emit an ignored \ and the second character as well.
+    # We're emitting TWO spans here; we don't wait until the subsequent
+    # character.  That is OK.
+    #
+    # Problem: if '\ ' is the last one, we don't want to emit a trailing span?
+    # In all other cases we do.
+
+    (ST.Backslash, CH.DE_White):  (ST.Black,     EMIT.Escape),   # '\ '
+    (ST.Backslash, CH.DE_Gray):   (ST.Black,     EMIT.Escape),   # '\_'
+    (ST.Backslash, CH.Black):     (ST.Black,     EMIT.Escape),   # '\a'
+    # NOTE: second character is a backslash, but new state is ST.Black!
+    (ST.Backslash, CH.Backslash): (ST.Black,     EMIT.Escape),   # '\\'
+}
+
+LAST_SPAN_ACTION = {
+    ST.Black: EMIT.Part,
+    ST.Backslash: EMIT.Escape,
+    # Ignore trailing IFS whitespace too.  This is necessary for the case:
+    # IFS=':' ; read x y z <<< 'a : b : c :'.
+    ST.DE_White1: EMIT.Nothing,
+    ST.DE_Gray: EMIT.Delim,
+    ST.DE_White2: EMIT.Delim,
+}
+
diff --git a/osh/builtin_bracket.py b/osh/builtin_bracket.py
@@ -17,7 +17,6 @@
 from asdl import runtime
 from core import error
 from core.util import p_die
-
 from osh import sh_expr_eval
 from osh import bool_parse
 from osh import word_parse

diff --git a/osh/runtime.asdl b/osh/runtime.asdl
@@ -104,7 +104,7 @@ module runtime
     Proc(int code)
   | Pipeline(int* codes)
 
-  -- For word splitting in legacy.py
+  -- For word splitting (in frontend/consts.py and osh/split.py)
   span = Black | Delim | Backslash
   emit = Part | Delim | Empty | Escape | Nothing
   state = Invalid | Start | DE_White1 | DE_Gray | DE_White2 | Black | Backslash

diff --git a/osh/split.py b/osh/split.py
@@ -35,6 +35,7 @@
 
 from core import util
 from core.util import log
+from frontend import consts
 from mycpp import mylib
 from mycpp.mylib import tagswitch
 
@@ -230,89 +231,6 @@ def Split(self, s, allow_escape):
     raise NotImplementedError()
 
 
-# IFS splitting is complicated in general.  We handle it with three concepts:
-#
-# - CH.* - Kinds of characters (edge labels)
-# - ST.* - States (node labels)
-# - EMIT.*  Actions
-#
-# The Split() loop below classifies characters, follows state transitions, and
-# emits spans.  A span is a (ignored Bool, end_index Int) pair.
-
-# As an example, consider this string:
-# 'a _ b'
-#
-# The character classes are:
-#
-# a      ' '        _        ' '        b
-# Black  DE_White   DE_Gray  DE_White   Black
-#
-# The states are:
-#
-# a      ' '        _        ' '        b
-# Black  DE_White1  DE_Gray  DE_White2  Black
-#
-# DE_White2 is whitespace that follows a "gray" non-whitespace IFS character.
-#
-# The spans emitted are:
-#
-# (part 'a', ignored ' _ ', part 'b')
-
-# SplitForRead() will check if the last two spans are a \ and \\n.  Easy.
-
-
-TRANSITIONS = {
-    # Whitespace should have been stripped
-    (ST.Start, CH.DE_White):  (ST.Invalid,   EMIT.Nothing),      # ' '
-    (ST.Start, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty), # '_'
-    (ST.Start, CH.Black):     (ST.Black,     EMIT.Nothing),    # 'a'
-    (ST.Start, CH.Backslash): (ST.Backslash, EMIT.Nothing),    # '\'
-
-    (ST.DE_White1, CH.DE_White):  (ST.DE_White1, EMIT.Nothing),  # '  '
-    (ST.DE_White1, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Nothing),  # ' _'
-    (ST.DE_White1, CH.Black):     (ST.Black,     EMIT.Delim),  # ' a'
-    (ST.DE_White1, CH.Backslash): (ST.Backslash, EMIT.Delim),  # ' \'
-
-    (ST.DE_Gray, CH.DE_White):  (ST.DE_White2, EMIT.Nothing),    # '_ '
-    (ST.DE_Gray, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty), # '__'
-    (ST.DE_Gray, CH.Black):     (ST.Black,     EMIT.Delim),    # '_a'
-    (ST.DE_Gray, CH.Backslash): (ST.Black,     EMIT.Delim),    # '_\'
-
-    (ST.DE_White2, CH.DE_White):  (ST.DE_White2, EMIT.Nothing),    # '_  '
-    (ST.DE_White2, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Empty), # '_ _'
-    (ST.DE_White2, CH.Black):     (ST.Black,     EMIT.Delim),    # '_ a'
-    (ST.DE_White2, CH.Backslash): (ST.Backslash, EMIT.Delim),    # '_ \'
-
-    (ST.Black, CH.DE_White):  (ST.DE_White1, EMIT.Part),  # 'a '
-    (ST.Black, CH.DE_Gray):   (ST.DE_Gray,   EMIT.Part),  # 'a_'
-    (ST.Black, CH.Black):     (ST.Black,     EMIT.Nothing),    # 'aa'
-    (ST.Black, CH.Backslash): (ST.Backslash, EMIT.Part),  # 'a\'
-
-    # Here we emit an ignored \ and the second character as well.
-    # We're emitting TWO spans here; we don't wait until the subsequent
-    # character.  That is OK.
-    #
-    # Problem: if '\ ' is the last one, we don't want to emit a trailing span?
-    # In all other cases we do.
-
-    (ST.Backslash, CH.DE_White):  (ST.Black,     EMIT.Escape),  # '\ '
-    (ST.Backslash, CH.DE_Gray):   (ST.Black,     EMIT.Escape),  # '\_'
-    (ST.Backslash, CH.Black):     (ST.Black,     EMIT.Escape),  # '\a'
-    # NOTE: second character is a backslash, but new state is ST.Black!
-    (ST.Backslash, CH.Backslash): (ST.Black,     EMIT.Escape),  # '\\'
-}
-
-LAST_SPAN_ACTION = {
-    ST.Black: EMIT.Part,
-    ST.Backslash: EMIT.Escape,
-    # Ignore trailing IFS whitespace too.  This is necessary for the case:
-    # IFS=':' ; read x y z <<< 'a : b : c :'.
-    ST.DE_White1: EMIT.Nothing,
-    ST.DE_Gray: EMIT.Delim,
-    ST.DE_White2: EMIT.Delim,
-}
-
-
 class IfsSplitter(_BaseSplitter):
   """Split a string when IFS has non-whitespace characters."""
 
@@ -373,7 +291,7 @@ def Split(self, s, allow_escape):
       else:
         ch = CH.Black
 
-      new_state, action = TRANSITIONS[state, ch]
+      new_state, action = consts.TRANSITIONS[state, ch]
       if new_state == ST.Invalid:
         raise AssertionError(
             'Invalid transition from %r with %r' % (state, ch))
@@ -399,7 +317,7 @@ def Split(self, s, allow_escape):
       state = new_state
       i += 1
 
-    last_action = LAST_SPAN_ACTION[state]
+    last_action = consts.LAST_SPAN_ACTION[state]
     #log('n %d state %s last_action %s', n, state, last_action)
 
     if last_action == EMIT.Part:

diff --git a/osh/word_eval.py b/osh/word_eval.py
@@ -6,42 +6,41 @@
 from _devbuild.gen.syntax_asdl import (
     braced_var_sub, Token,
     word, word_e, word_t, compound_word,
-    bracket_op_e, suffix_op_e, word_part_e,
-    bracket_op__ArrayIndex, bracket_op__WholeArray,
-    suffix_op__Nullary, suffix_op__PatSub, suffix_op__Slice,
-    suffix_op__Unary, sh_array_literal,
-    single_quoted, double_quoted, simple_var_sub, command_sub,
-    word_part__ArithSub, word_part__EscapedLiteral,
-    word_part__AssocArrayLiteral, word_part__ExprSub,
-    word_part__ExtGlob, word_part__FuncCall,
-    word_part__Splice, word_part__TildeSub,
+    bracket_op_e, bracket_op__ArrayIndex, bracket_op__WholeArray,
+    suffix_op_e, suffix_op__Nullary, suffix_op__PatSub, suffix_op__Slice,
+    suffix_op__Unary,
+    sh_array_literal, single_quoted, double_quoted, simple_var_sub,
+    command_sub,
+    word_part_e, word_part__ArithSub, word_part__EscapedLiteral,
+    word_part__AssocArrayLiteral, word_part__ExprSub, word_part__ExtGlob,
+    word_part__FuncCall, word_part__Splice, word_part__TildeSub,
 )
 from _devbuild.gen.runtime_asdl import (
     builtin_e, effect_e,
-    part_value, part_value_e, part_value_t, part_value__String, part_value__Array,
-    value, value_e, value_t, lvalue,
-    assign_arg, 
+    part_value, part_value_e, part_value_t, part_value__String,
+    part_value__Array,
+    value, value_e, value_t, value__Str, value__AssocArray,
+    value__MaybeStrArray, value__Obj,
+    lvalue, assign_arg, 
     cmd_value_e, cmd_value_t, cmd_value, cmd_value__Assign, cmd_value__Argv,
-    value__Str, value__AssocArray, value__MaybeStrArray, value__Obj,
     quote_e, quote_t,
 )
 from core import builtin_def
 from core import error
 from core import passwd
 from core import process
+from core import state
 from core.util import log, e_die, e_strict
 from frontend import consts
 from frontend import match
+from mycpp.mylib import tagswitch
+from mycpp import mylib
 from osh import braces
 from osh import glob_
 from osh import string_ops
-from core import state
 from osh import word_
 from osh import word_compile
 
-from mycpp.mylib import tagswitch
-from mycpp import mylib
-
 import posix_ as posix
 
 from typing import Optional, Tuple, List, Dict, cast, TYPE_CHECKING