[j8] declare -p uses new fastfunc.ShellEncodeString()

Wrote C++ functions that write to a BufWriter. We don't strictly need BufWriter right now, but using it makes it easy to follow the style of J8EncodeString().

TODO:
- These functions can be moved to j8_lite.py.
- Use j8_lite.py everywhere, and then QSN can be deleted.
- qsn.maybe_shell_encode() -> j8_lite.MaybeShellEncode()
oilshell · Jan 29, 2024 · dc5bb5d · dc5bb5d
1 parent 80c82ca
commit dc5bb5d
Show file tree

Hide file tree

Showing 5 changed files with 199 additions and 53 deletions.
diff --git a/cpp/data_lang.cc b/cpp/data_lang.cc
@@ -8,52 +8,52 @@
 // TODO: remove duplication
 #define LOSSY_JSON (1 << 3)
 
-namespace fastfunc {
-
-bool CanOmitQuotes(BigStr* s) {
-  return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
-}
+namespace {
 
-BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
-  auto buf = Alloc<mylib::BufWriter>();
-  int options = j8_fallback ? 0 : LOSSY_JSON;
-  pyj8::WriteString(s, options, buf);
-  return buf->getvalue();
-}
+void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
+  uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
+  uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 
-}  // namespace fastfunc
+  buf->WriteConst("b'");
 
-namespace pyj8 {
+  // Set up pointers after writing opening quote
+  uint8_t* out = buf->LengthPointer();  // mutated
+  uint8_t* out_end = buf->CapacityPointer();
 
-bool PartIsUtf8(BigStr* s, int start, int end) {
-  uint32_t codepoint;
-  uint32_t state = UTF8_ACCEPT;
+  while (true) {
+    J8EncodeChunk(&in, in_end, &out, out_end, true);  // Fill as much as we can
+    buf->SetLengthFrom(out);
 
-  for (int i = start; i < end; ++i) {
-    // This var or a static_cast<> is necessary.  Should really change BigStr*
-    // to use unsigned type
-    uint8_t c = s->data_[i];
-    decode(&state, &codepoint, c);
-    if (state == UTF8_REJECT) {
-      return false;
+    if (in >= in_end) {
+      break;
     }
+
+    // Same growth policy as below
+    capacity = capacity * 3 / 2;
+    // printf("[2] new capacity %d\n", capacity);
+    buf->EnsureMoreSpace(capacity);
+
+    // Recompute pointers
+    out = buf->LengthPointer();
+    out_end = buf->CapacityPointer();
   }
 
-  return state == UTF8_ACCEPT;
+  buf->WriteConst("'");
 }
 
-void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
+void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
   uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
   uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 
-  buf->WriteConst("b'");
+  buf->WriteConst("$'");
 
   // Set up pointers after writing opening quote
   uint8_t* out = buf->LengthPointer();  // mutated
   uint8_t* out_end = buf->CapacityPointer();
 
   while (true) {
-    J8EncodeChunk(&in, in_end, &out, out_end, true);  // Fill as much as we can
+    BashDollarEncodeChunk(&in, in_end, &out,
+                          out_end);  // Fill as much as we can
     buf->SetLengthFrom(out);
 
     if (in >= in_end) {
@@ -73,6 +73,114 @@ void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
   buf->WriteConst("'");
 }
 
+// Style is COPIED from pyj8::WriteString()
+// Functionality is like j8_libc.c ShellEncodeString, that is:
+//
+// call BourneShellEncodeChunk()
+// then, if that reports it cannot encode the input, fall back to either
+//   WriteBString()
+//   WriteBashDollarString()
+
+void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
+  uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
+  uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
+
+  // Growth policy: Start at a fixed size max(N + 3 + 2, 16)
+  int capacity = len(s) + 3 + 2;  // 3 for quotes, 2 potential \" \n
+  if (capacity < 16) {            // account for J8_MAX_BYTES_PER_INPUT_BYTE
+    capacity = 16;
+  }
+  // printf("[1] capacity %d\n", capacity);
+
+  buf->EnsureMoreSpace(capacity);
+
+  int begin = buf->Length();  // maybe Truncate to this position
+  buf->WriteConst("'");
+
+  // Set up pointers after writing opening quote
+  uint8_t* out = buf->LengthPointer();  // mutated
+  uint8_t* out_end = buf->CapacityPointer();
+
+  while (true) {
+    // Fill in as much as we can
+    int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
+    if (cannot_encode) {
+      buf->Truncate(begin);
+      if (ysh_fallback) {
+        WriteBString(s, buf, capacity);  // fall back to b''
+      } else {
+        WriteBashDollarString(s, buf, capacity);  // fall back to $''
+      }
+      return;
+    }
+    buf->SetLengthFrom(out);
+
+    // printf("[1] len %d\n", out_buf->len);
+
+    if (in >= in_end) {
+      break;
+    }
+
+    // Growth policy: every time through the loop, increase 1.5x
+    //
+    // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
+    // This seems like a reasonable tradeoff between over-allocating and too
+    // many realloc().
+    capacity = capacity * 3 / 2;
+    // printf("[1] new capacity %d\n", capacity);
+    buf->EnsureMoreSpace(capacity);
+
+    // Recompute pointers
+    out = buf->LengthPointer();  // mutated
+    out_end = buf->CapacityPointer();
+    // printf("[1] out %p out_end %p\n", out, out_end);
+  }
+
+  buf->WriteConst("'");
+}
+
+}  // namespace
+
+namespace fastfunc {
+
+bool CanOmitQuotes(BigStr* s) {
+  return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
+}
+
+BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
+  auto buf = Alloc<mylib::BufWriter>();
+  int options = j8_fallback ? 0 : LOSSY_JSON;
+  pyj8::WriteString(s, options, buf);
+  return buf->getvalue();
+}
+
+BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
+  auto buf = Alloc<mylib::BufWriter>();
+  ::ShellEncodeString(s, ysh_fallback, buf);
+  return buf->getvalue();
+}
+
+}  // namespace fastfunc
+
+namespace pyj8 {
+
+bool PartIsUtf8(BigStr* s, int start, int end) {
+  uint32_t codepoint;
+  uint32_t state = UTF8_ACCEPT;
+
+  for (int i = start; i < end; ++i) {
+    // This var or a static_cast<> is necessary.  Should really change BigStr*
+    // to use unsigned type
+    uint8_t c = s->data_[i];
+    decode(&state, &codepoint, c);
+    if (state == UTF8_REJECT) {
+      return false;
+    }
+  }
+
+  return state == UTF8_ACCEPT;
+}
+
 void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
   bool j8_fallback = !(options & LOSSY_JSON);
 

diff --git a/cpp/data_lang.h b/cpp/data_lang.h
@@ -10,6 +10,8 @@ bool CanOmitQuotes(BigStr* s);
 
 BigStr* J8EncodeString(BigStr* s, int j8_fallback);
 
+BigStr* ShellEncodeString(BigStr* s, int ysh_fallback);
+
 }  // namespace fastfunc
 
 namespace pyj8 {

diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -4,14 +4,6 @@
 
 TODO:
 
-- Translate the whole thing to C++
-  - use Bjoern DFA for UTF-8 validation in printing and parsing
-  - move more of LexerDecoder out of pyj8.py?  I think it can translate
-
-- Remove most of QSN
-  - QSN maybe_shell_encode() is used for bash features
-  - Remove shell_compat which does \\x00 instead of \\0
-
 - Many more tests
   - Run JSONTestSuite
 
@@ -22,12 +14,7 @@
   - line wrapping -- do this later
   - would like CONTRIBUTORS here
 
-- Harmonize the API in data_lang/qsn.py 
-  - use mylib.BufWriter output
-  - use u_style.LiteralUtf8 instead of BIT8_UTF8, etc.
-
 - Unify with ASDL pretty printing?
-
    {} [] are for JSON?
    () is for statically typed ASDL data?
       (command.Simple blame_tok:(...) words:[ ])
@@ -42,12 +29,13 @@
 from core import error
 from core import vm
 from data_lang import pyj8
-from data_lang import qsn
 from frontend import consts
 from frontend import match
 from mycpp import mylib
 from mycpp.mylib import tagswitch, iteritems, NewDict, log
 
+import fastfunc
+
 _ = log
 
 from typing import cast, Dict, List, Tuple, Optional
@@ -62,23 +50,27 @@
 
 def MaybeShellEncode(s):
     # type: (str) -> str
-    return qsn.maybe_shell_encode(s)
+    """
+    This is like ShellEncode(s, unquoted_ok=True)
+    But it's common, so we give it a shorter name.
+    """
+    if fastfunc.CanOmitQuotes(s):
+        return s
+
+    return fastfunc.ShellEncodeString(s, 0)  # no ysh_fallback
 
 
 def ShellEncode(s):
     # type: (str) -> str
+    return fastfunc.ShellEncodeString(s, 0)  # no ysh_fallback
 
-    # TODO: call fastfunc.ShellEncodeString()
-    return qsn.maybe_shell_encode(s, flags=qsn.MUST_QUOTE)
 
+def YshEncode(s, unquoted_ok=False):
+    # type: (str, bool) -> str
+    if unquoted_ok and fastfunc.CanOmitQuotes(s):
+        return s
 
-def YshEncode(s):
-    # type: (str) -> str
-
-    # TODO: call fastfunc.ShellEncodeString(STYLE_B_STRING)
-    #
-    # ysh_fallback -- b'' style
-    return qsn.maybe_shell_encode(s, flags=qsn.MUST_QUOTE)
+    return fastfunc.ShellEncodeString(s, 1)  # ysh_fallback
 
 
 class Printer(object):

diff --git a/data_lang/j8_lite.py b/data_lang/j8_lite.py
@@ -15,7 +15,6 @@ def EncodeString(s, unquoted_ok=False):
     that method, then call BufWriter.clear() in between.
     """
     if unquoted_ok and fastfunc.CanOmitQuotes(s):
-        #if unquoted_ok and fastlex.CanOmitQuotes(s):
         return s
 
     return fastfunc.J8EncodeString(s, 1)  # j8_fallback is true
diff --git a/spec/assign-extended.test.sh b/spec/assign-extended.test.sh
@@ -220,6 +220,51 @@ typeset -x test_var3=333
 typeset test_var5=555
 ## END
 
+#### declare -p doesn't print binary data, but can be loaded into bash
+
+# bash prints binary data!
+case $SH in bash|mksh) exit ;; esac
+
+unquoted='foo'
+sq='foo bar'
+bash1=$'\x1f'  # ASCII control char
+bash2=$'\xfe\xff'  # Invalid UTF-8
+
+s1=$unquoted
+s2=$sq
+s3=$bash1
+s4=$bash2
+
+declare -a a=("$unquoted" "$sq" "$bash1" "$bash2")
+declare -A A=(["$unquoted"]="$sq" ["$bash1"]="$bash2")
+
+#echo lengths ${#s1} ${#s2} ${#s3} ${#s4} ${#a[@]} ${#A[@]}
+
+declare -p s1 s2 s3 s4 a A | tee tmp.bash
+
+echo ---
+
+bash -c 'source tmp.bash; echo "$s1 $s2"; echo -n "$s3" "$s4" | od -A n -t x1'
+echo bash=$?
+
+## STDOUT:
+declare -- s1=foo
+declare -- s2='foo bar'
+declare -- s3=$'\u001f'
+declare -- s4=$'\xfe\xff'
+declare -a a=(foo 'foo bar' $'\u001f' $'\xfe\xff')
+declare -A A=([$'\u001f']=$'\xfe\xff' ['foo']='foo bar')
+---
+foo foo bar
+ 1f 20 fe ff
+bash=0
+## END
+
+## N-I bash/mksh STDOUT:
+## END
+
+
+
 #### declare -p var
 # BUG? bash doesn't output anything for 'local/readonly -p var', which seems to
 #   contradict with manual.  Besides, 'export -p var' is not described in
@@ -388,7 +433,7 @@ declare -A test_var7=()
 
 #### declare -pnrx var
 # Note: Bash ignores other flags (-nrx) when variable names are supplied while
-#   Oil uses other flags to select variables.  Bash's behavior is documented.
+#   OSH uses other flags to select variables.  Bash's behavior is documented.
 test_var1=111
 readonly test_var2=222
 export test_var3=333