[j8] Start implementing ShellEncodeString()

Set up some unit tests. Right now the inner loop calls J8EncodeOne().
oilshell · Jan 29, 2024 · e2ce626 · e2ce626
1 parent e4a1841
commit e2ce626
Show file tree

Hide file tree

Showing 6 changed files with 208 additions and 45 deletions.
diff --git a/data_lang/j8.h b/data_lang/j8.h
@@ -147,6 +147,30 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
   //
 }
 
+// Like the above, but
+//
+// escape_style == 0 - try shell 'foo'
+//   must be valid UTF-8
+//   no control chars
+//   no ' is required
+//   no \ -- not required, but avoids ambiguous '\n'
+//
+// For example we write $'\\' or b'\\' not '\'
+// The latter should be written r'\', but we're not outputting r'' strings
+
+#define STYLE_SQ 0         // 'foo'
+#define STYLE_DOLLAR_SQ 1  // $'\xff'
+#define STYLE_B_STRING 2   // b'\yff'
+
+// escape_style == 1 means $'\xff'
+//
+// escape_style == 2 means b'\yff' I think?
+
+// Encode the next piece of input at *p_in into *p_out for shell-style output.
+//
+// Currently a thin wrapper: escape_style is accepted but not yet consulted,
+// and the work is delegated to J8EncodeOne() with its third argument set to 1.
+//
+// Returns whatever J8EncodeOne() returns; ShellEncodeChunk() treats a nonzero
+// value as "cannot encode" and returns early.
+static inline int ShellEncodeOne(unsigned char** p_in, unsigned char** p_out,
+                                 int escape_style) {
+  (void)escape_style;  // not used yet
+  // Propagate the result: falling off the end of a non-void function whose
+  // value the caller uses is undefined behavior.
+  return J8EncodeOne(p_in, p_out, 1);
+}
+
 // Right now \u001f and \u{1f} are the longest output sequences for a byte.
 // Bug fix: we need 6 + 1 for the NUL terminator that sprintf() writes!  (Even
 // though we don't technically need it)
@@ -166,17 +190,13 @@ static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
   return 0;
 }
 
-// TODO: $'\x00\u1234' escaping
-inline int ShellEncodeOne(unsigned char** p_in, unsigned char** p_out) {
-  return 0;
-}
-
 inline int ShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
-                            unsigned char** p_out, unsigned char* out_end) {
+                            unsigned char** p_out, unsigned char* out_end,
+                            int escape_style) {
   while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
-    int dollar_fallback = ShellEncodeOne(p_in, p_out);
-    if (dollar_fallback) {     // we need escaping, e.g. \u0001 or \'
-      return dollar_fallback;  // early return
+    int cannot_encode = ShellEncodeOne(p_in, p_out, escape_style);
+    if (cannot_encode) {     // we need escaping, e.g. \u0001 or \'
+      return cannot_encode;  // early return
     }
   }
   return 0;

diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -83,16 +83,8 @@ class Printer(object):
 
     def __init__(self):
         # type: () -> None
-        """
-        Args:
-          # These can all be packed into the same byte.  ASDL needs bit_set
-          # support I think?
-          options:
-            control j"" vs. ""
-            control escaping \\x and \\u escaping like QSN
-            pretty.UnquotedKeys - ASDL uses this?
-        """
-        self.options = 0
+
+        # TODO: should remove this cache in favor of a BufWriter method
         self.spaces = {0: ''}  # cache of strings with spaces
 
     # Could be PrintMessage or PrintJsonMessage()

diff --git a/data_lang/j8_libc.c b/data_lang/j8_libc.c
@@ -44,6 +44,48 @@ void EncodeBString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
   J8_OUT('\0');  // NUL terminate for printf
 }
 
+// $'' escaping: writes $'...' into out_buf, whose data must already hold `capacity` bytes
+// This function is a COPY of EncodeBString() above.  out_buf->len excludes the final NUL.
+void EncodeBashDollarString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
+  // Compute pointers for the inner loop
+  unsigned char* in = (unsigned char*)in_buf.data;
+  unsigned char* in_end = in + in_buf.len;
+
+  unsigned char* out = out_buf->data;  // mutated
+  unsigned char* out_end = out_buf->data + capacity;
+  unsigned char** p_out = &out;  // presumably what J8_OUT() writes through -- confirm
+
+  J8_OUT('$');  // Left quote $'
+  J8_OUT('\'');
+
+  while (true) {
+    // printf("B iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
+    // out, out_end);
+    // Fill as much as we can
+    ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_DOLLAR_SQ);
+    out_buf->len = out - out_buf->data;  // recompute length
+
+    if (in >= in_end) {
+      break;
+    }
+
+    // Same growth policy as below: grow by 1.5x
+    capacity = capacity * 3 / 2;
+    // printf("[2] new capacity %d\n", capacity);
+    out_buf->data = (unsigned char*)realloc(out_buf->data, capacity);
+
+    // Recompute pointers: realloc() may have moved the buffer (its result is unchecked)
+    out = out_buf->data + out_buf->len;
+    out_end = out_buf->data + capacity;
+    p_out = &out;
+  }
+
+  J8_OUT('\'');  // Right quote
+  out_buf->len = out - out_buf->data;
+
+  J8_OUT('\0');  // NUL terminate for printf
+}
+
 void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback) {
   unsigned char* in = (unsigned char*)in_buf.data;
   unsigned char* in_end = in + in_buf.len;
@@ -105,10 +147,75 @@ void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback) {
   J8_OUT('\0');  // NUL terminate for printf
 }
 
-// $'' escaping, very similar to J8
-void BashDollarEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf) {
-}
-
 // Start with '', but fall back on $'' for ASCII control and \'
-void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf) {
+//
+// Depending on options, fall back to
+//
+// EncodeBashDollarString() -- $'\xff'
+// EncodeBString()          -- b'\yff'
+
+// Mostly a COPY of J8EncodeString() above.  Allocates out_buf->data with malloc().
+void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
+  unsigned char* in = (unsigned char*)in_buf.data;
+  unsigned char* in_end = in + in_buf.len;
+
+  // Growth policy: Start at a fixed size max(N + 3 + 2, 16)
+  int capacity = in_buf.len + 3 + 2;  // 3 for quotes, 2 potential \" \n
+  if (capacity < 16) {                // account for J8_MAX_BYTES_PER_INPUT_BYTE
+    capacity = 16;
+  }
+  // printf("[1] capacity %d escape_style %d\n", capacity, escape_style);
+
+  out_buf->data = (unsigned char*)malloc(capacity);
+  out_buf->len = 0;  // starts out empty
+
+  unsigned char* out = out_buf->data;  // mutated
+  unsigned char* out_end = out_buf->data + capacity;
+  unsigned char** p_out = &out;
+
+  J8_OUT('"');  // NOTE(review): emits " although the header comment describes '' -- confirm
+
+  while (true) {
+    // Fill in as much as we can
+    // printf("J8 iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
+    // out, out_end);
+    int cannot_encode = ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_SQ);
+    if (cannot_encode) {
+      out_buf->len = 0;  // rewind to beginning
+      // printf("out %p out_end %p capacity %d\n", out, out_end, capacity);
+      if (escape_style == STYLE_DOLLAR_SQ) {
+        EncodeBashDollarString(in_buf, out_buf, capacity);  // fall back to $''
+      } else {
+        EncodeBString(in_buf, out_buf, capacity);  // fall back to b''
+      }
+      // printf("len %d\n", out_buf->len);
+      return;
+    }
+    out_buf->len = out - out_buf->data;  // recompute length
+    // printf("[1] len %d\n", out_buf->len);
+
+    if (in >= in_end) {
+      break;
+    }
+
+    // Growth policy: every time through the loop, increase 1.5x
+    //
+    // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
+    // This seems like a reasonable tradeoff between over-allocating and too
+    // many realloc().
+    capacity = capacity * 3 / 2;
+    // printf("[1] new capacity %d\n", capacity);
+    out_buf->data = (unsigned char*)realloc(out_buf->data, capacity);
+
+    // Recompute pointers
+    out = out_buf->data + out_buf->len;
+    out_end = out_buf->data + capacity;
+    p_out = &out;
+    // printf("[1] out %p out_end %p\n", out, out_end);
+  }
+
+  J8_OUT('"');
+  out_buf->len = out - out_buf->data;
+
+  J8_OUT('\0');  // NUL terminate for printf
 }
diff --git a/data_lang/j8_libc.h b/data_lang/j8_libc.h
@@ -23,6 +23,6 @@ typedef struct j8_buf_t {
 
 void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback);
 
-void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf);
+void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style);
 
 #endif  // DATA_LANG_J8_LIBC_H
diff --git a/data_lang/j8_libc_test.c b/data_lang/j8_libc_test.c
@@ -67,6 +67,49 @@ TEST encode_test() {
   PASS();
 }
 
+TEST shell_encode_test() {
+  // Feed each shared J8 test case through ShellEncodeString() with both
+  // fallback styles, printing the results.
+  int case_index = 0;
+  while (J8_TEST_CASES[case_index]) {
+    const char* input = J8_TEST_CASES[case_index];
+    int input_len = strlen(input);
+    j8_buf_t in = {(unsigned char*)input, input_len};
+
+    // First pass: fall back to $'' when the input cannot be encoded as-is
+    j8_buf_t result = {0};
+    ShellEncodeString(in, &result, STYLE_DOLLAR_SQ);
+
+    printf("result %s\n", result.data);
+    printf("result.len %d\n", result.len);
+
+    // Sanity-check the encoded length against the input length
+    if (input_len == 0) {
+      ASSERT_EQ_FMT(2, result.len, "%d");  // empty string -> ""
+    } else if (input_len == 1) {
+      ASSERT_EQ_FMT(3, result.len, "%d");  // x -> "x"
+    } else {
+      ASSERT(input_len < result.len);
+    }
+    free(result.data);
+
+    // Second pass: same input, falling back to b'' instead
+    result = {0};
+    ShellEncodeString(in, &result, STYLE_B_STRING);
+
+    printf("result %s\n", result.data);
+    printf("result.len %d\n", result.len);
+    free(result.data);
+
+    printf("\n");
+    ++case_index;
+  }
+
+  PASS();
+}
+
 TEST can_omit_quotes_test() {
   const char* s = "foo";
   ASSERT(CanOmitQuotes((unsigned char*)s, strlen(s)));
@@ -86,6 +129,7 @@ int main(int argc, char** argv) {
   GREATEST_MAIN_BEGIN();
 
   RUN_TEST(encode_test);
+  RUN_TEST(shell_encode_test);
   RUN_TEST(char_int_test);
   RUN_TEST(can_omit_quotes_test);
 

diff --git a/data_lang/j8_test.cc b/data_lang/j8_test.cc
@@ -140,7 +140,7 @@ void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
       // printf("RETRY\n");
       result->erase(begin_index, std::string::npos);
       EncodeBString(s, n, result);  // fall back to b''
-      printf("\t[1] result len %d\n", result->size());
+      printf("\t[1] result len %d\n", static_cast<int>(result->size()));
       return;
     }
 
@@ -152,7 +152,7 @@ void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
     result->erase(end_index, std::string::npos);
   }
   result->append("\"");
-  printf("\t[1] result len %d\n", result->size());
+  printf("\t[1] result len %d\n", static_cast<int>(result->size()));
 }
 
 void EncodeAndPrint(char* s, int n, int j8_fallback) {
@@ -177,33 +177,33 @@ void EncodeAndPrint(char* s, int n, int j8_fallback) {
 
 TEST encode_test() {
+  // Every input below is encoded twice: with j8_fallback 0 and then 1.
 #if 1
-  char* mixed = "hi \x01 \u4000\xfe\u4001\xff\xfd ' \" new \n \\ \u03bc";
-  EncodeAndPrint(mixed, strlen(mixed), 0);
-  EncodeAndPrint(mixed, strlen(mixed), 1);
+  const char* mixed = "hi \x01 \u4000\xfe\u4001\xff\xfd ' \" new \n \\ \u03bc";
+  EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 0);
+  EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 1);
 #endif
 
-  char* a = "ab";
-  EncodeAndPrint(a, strlen(a), 0);
-  EncodeAndPrint(a, strlen(a), 1);
+  const char* a = "ab";
+  EncodeAndPrint(const_cast<char*>(a), strlen(a), 0);
+  EncodeAndPrint(const_cast<char*>(a), strlen(a), 1);
 
-  char* b = "0123456789";
-  EncodeAndPrint(b, strlen(b), 0);
-  EncodeAndPrint(b, strlen(b), 1);
+  const char* b = "0123456789";
+  EncodeAndPrint(const_cast<char*>(b), strlen(b), 0);
+  EncodeAndPrint(const_cast<char*>(b), strlen(b), 1);
 
-  char* u = "hi \u4000 \u03bc";
-  EncodeAndPrint(u, strlen(u), 0);
-  EncodeAndPrint(u, strlen(u), 1);
+  // Pass u itself (not b) so that this input is what actually gets encoded.
+  const char* u = "hi \u4000 \u03bc";
+  EncodeAndPrint(const_cast<char*>(u), strlen(u), 0);
+  EncodeAndPrint(const_cast<char*>(u), strlen(u), 1);
 
   // Internal NUL
-  char* bin = "\x00\x01\xff";
-  EncodeAndPrint(bin, 3, 0);
-  EncodeAndPrint(bin, 3, 1);
+  const char* bin = "\x00\x01\xff";
+  EncodeAndPrint(const_cast<char*>(bin), 3, 0);
+  EncodeAndPrint(const_cast<char*>(bin), 3, 1);
 
   // Blow up size
-  char* blowup =
+  const char* blowup =
       "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe";
-  EncodeAndPrint(blowup, strlen(blowup), 0);
-  EncodeAndPrint(blowup, strlen(blowup), 1);
+  EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 0);
+  EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 1);
 
   PASS();
 }