[j8] Implement low-level shell encoding

With tests in j8_libc_test.c Start with Bourne shell '' strings, then fall back on bash $'' strings. TODO: - Wrap this in Python - Wrap this in C++ - Add wrapper with CanOmitQuotes()
oilshell · Jan 29, 2024 · 23756b8 · 23756b8
1 parent e2ce626
commit 23756b8
Show file tree

Hide file tree

Showing 5 changed files with 160 additions and 22 deletions.
diff --git a/data_lang/j8.h b/data_lang/j8.h
@@ -141,15 +141,111 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
       break;
     }
   }
+  // Unreachable
+}
+
+// Like the above, but
+//
+//   \xff instead of \yff
+//   \u001f always, never \u{1f}
+//   No JSON vs. J8
+//     No \" escape ever
+//     No errors -- it can encode everything
+
+// Encode one input unit at *p_in -- an ASCII byte, a complete UTF-8 sequence,
+// or a single invalid byte -- the way it is spelled inside a bash $'...'
+// string.
+//
+// Both cursors are advanced: *p_in past the bytes consumed, *p_out past the
+// bytes written.  Nothing is returned because every byte has an encoding.
+// The caller must leave room at *p_out for the longest escape (\u001f); note
+// that sprintf also stores a terminating NUL right after what it formats.
+//
+// NOTE(review): the multi-byte loop reads **p_in without knowing where the
+// input ends, so a sequence truncated at the very end looks one byte past
+// it -- presumably the input is NUL-terminated; confirm against callers.
+static inline void BashDollarEncodeOne(unsigned char** p_in,
+                                       unsigned char** p_out) {
+  unsigned char ch = **p_in;
 
   //
-  // Unreachable
+  // Handle \\ \b \f \n \r \t \'
   //
+  switch (ch) {
+  case '\\':
+    J8_OUT('\\');
+    J8_OUT('\\');
+    (*p_in)++;
+    return;
+  case '\b':
+    J8_OUT('\\');
+    J8_OUT('b');
+    (*p_in)++;
+    return;
+  case '\f':
+    J8_OUT('\\');
+    J8_OUT('f');
+    (*p_in)++;
+    return;
+  case '\n':
+    J8_OUT('\\');
+    J8_OUT('n');
+    (*p_in)++;
+    return;
+  case '\r':
+    J8_OUT('\\');
+    J8_OUT('r');
+    (*p_in)++;
+    return;
+  case '\t':
+    J8_OUT('\\');
+    J8_OUT('t');
+    (*p_in)++;
+    return;
+  case '\'':  // would otherwise terminate the surrounding $'...'
+    J8_OUT('\\');
+    J8_OUT('\'');
+    (*p_in)++;
+    return;
+  }
+
+  //
+  // Unprintable ASCII control codes
+  //
+  if (ch < 0x20) {
+    int n = sprintf((char*)*p_out, "\\u%04x", ch);
+    *p_out += n;
+    (*p_in)++;
+    return;
+  }
+
+  //
+  // UTF-8 encoded runes and invalid bytes
+  //
+  unsigned char* start = *p_in;  // first byte of the sequence being decoded
+  uint32_t codepoint = 0;
+  uint32_t state = UTF8_ACCEPT;
+
+  while (1) {
+    decode(&state, &codepoint, ch);
+    switch (state) {
+    case UTF8_REJECT: {
+      // Escape only the byte that opened the sequence and resume right
+      // after it.  Rewinding to `start` keeps the lead bytes of a broken
+      // multi-byte sequence from being dropped, and leaves the byte that
+      // triggered the reject to be encoded on its own by the next call.
+      // %02x zero-pads, so the escape is always two hex digits.
+      int n = sprintf((char*)*p_out, "\\x%02x", *start);
+      *p_out += n;
+      *p_in = start + 1;
+      return;
+    }
+    case UTF8_ACCEPT: {
+      (*p_in)++;
+      // Valid rune: copy its bytes through unchanged
+      while (start < *p_in) {
+        J8_OUT(*start);
+        start++;
+      }
+      return;
+    }
+    default:
+      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
+      ch = **p_in;
+      break;
+    }
+  }
+  // Unreachable
 }
 
-// Like the above, but
+// BourneShellEncodeOne rules:
 //
-// escape_style == 0 - try shell 'foo'
 //   must be valid UTF-8
 //   no control chars
 //   no ' is required
@@ -158,17 +254,45 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
 // For example we write $'\\' or b'\\' not '\'
 // The latter should be written r'\', but we're not outputting
 
-#define STYLE_SQ 0         // 'foo'
-#define STYLE_DOLLAR_SQ 1  // $'\xff'
-#define STYLE_B_STRING 2   // b'\yff'
+// Try to copy one input unit at *p_in verbatim, as it may appear inside a
+// Bourne shell '...' string.
+//
+// Returns 0 after copying a complete valid UTF-8 sequence to *p_out, with
+// both cursors advanced.  Returns 1 when the unit cannot be written that way:
+// a single quote, a backslash, a control code below 0x20, or invalid UTF-8.
+static inline int BourneShellEncodeOne(unsigned char** p_in,
+                                       unsigned char** p_out) {
+  unsigned char byte = **p_in;
 
-// escape_style == 1 means $'\xff'
-//
-// escape_style == 2 means b'\yff' I think?
+  // Control codes, ' and \ are never emitted in the '' form
+  if (byte < 0x20 || byte == '\'' || byte == '\\') {
+    return 1;
+  }
+
+  // Everything else must decode as one complete UTF-8 sequence
+  unsigned char* seq_begin = *p_in;
+  uint32_t rune = 0;
+  uint32_t dfa = UTF8_ACCEPT;
 
-static inline int ShellEncodeOne(unsigned char** p_in, unsigned char** p_out,
-                                 int escape_style) {
-  J8EncodeOne(p_in, p_out, 1);
+  for (;;) {
+    decode(&dfa, &rune, byte);
+    if (dfa == UTF8_REJECT) {
+      return 1;  // invalid byte
+    }
+    (*p_in)++;  // this byte is part of the sequence
+    if (dfa == UTF8_ACCEPT) {
+      break;  // sequence is complete
+    }
+    byte = **p_in;  // sequence continues with the next byte
+  }
+
+  // Copy the bytes of the sequence through unchanged
+  for (; seq_begin < *p_in; seq_begin++) {
+    J8_OUT(*seq_begin);
+  }
+  return 0;
 }
 
 // Right now \u001f and \u{1f} are the longest output sequences for a byte.
@@ -190,11 +314,20 @@ static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
   return 0;
 }
 
-inline int ShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
-                            unsigned char** p_out, unsigned char* out_end,
-                            int escape_style) {
+// Encode input in bash $'...' style, one unit at a time, until the input is
+// used up or fewer than J8_MAX_BYTES_PER_INPUT_BYTE bytes of output room
+// remain.  *p_in and *p_out are left where encoding stopped, so the caller
+// can grow the output and call again.  Always returns 0.
+//
+// Declared static inline: a plain `inline` definition in a C header provides
+// no external definition, so a call the compiler chooses not to inline can
+// fail to link.
+static inline int BashDollarEncodeChunk(unsigned char** p_in,
+                                        unsigned char* in_end,
+                                        unsigned char** p_out,
+                                        unsigned char* out_end) {
+  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
+    BashDollarEncodeOne(p_in, p_out);
+  }
+  return 0;
+}
+
+inline int BourneShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
+                                  unsigned char** p_out,
+                                  unsigned char* out_end) {
   while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
-    int cannot_encode = ShellEncodeOne(p_in, p_out, escape_style);
+    int cannot_encode = BourneShellEncodeOne(p_in, p_out);
     if (cannot_encode) {     // we need escaping, e.g. \u0001 or \'
       return cannot_encode;  // early return
     }

diff --git a/data_lang/j8_libc.c b/data_lang/j8_libc.c
@@ -62,7 +62,7 @@ void EncodeBashDollarString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
     // printf("B iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
     // out, out_end);
     // Fill as much as we can
-    ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_DOLLAR_SQ);
+    BashDollarEncodeChunk(&in, in_end, &out, out_end);
     out_buf->len = out - out_buf->data;  // recompute length
 
     if (in >= in_end) {
@@ -173,13 +173,13 @@ void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
   unsigned char* out_end = out_buf->data + capacity;
   unsigned char** p_out = &out;
 
-  J8_OUT('"');
+  J8_OUT('\'');
 
   while (true) {
     // Fill in as much as we can
     // printf("J8 iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
     // out, out_end);
-    int cannot_encode = ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_SQ);
+    int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
     if (cannot_encode) {
       out_buf->len = 0;  // rewind to begining
       // printf("out %p out_end %p capacity %d\n", out, out_end, capacity);
@@ -214,7 +214,7 @@ void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
     // printf("[1] out %p out_end %p\n", out, out_end);
   }
 
-  J8_OUT('"');
+  J8_OUT('\'');
   out_buf->len = out - out_buf->data;
 
   J8_OUT('\0');  // NUL terminate for printf

diff --git a/data_lang/j8_libc.h b/data_lang/j8_libc.h
@@ -23,6 +23,9 @@ typedef struct j8_buf_t {
 
 void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback);
 
+#define STYLE_DOLLAR_SQ 1  // $'\xff'
+#define STYLE_B_STRING 2   // b'\yff'
+
 void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style);
 
 #endif  // DATA_LANG_J8_LIBC_H
diff --git a/data_lang/j8_libc_test.c b/data_lang/j8_libc_test.c
@@ -24,7 +24,7 @@ TEST char_int_test() {
   PASS();
 }
 
-TEST encode_test() {
+TEST j8_encode_test() {
   for (int i = 0; J8_TEST_CASES[i]; ++i) {
     const char* s = J8_TEST_CASES[i];
     int input_len = strlen(s);
@@ -128,7 +128,7 @@ GREATEST_MAIN_DEFS();
 int main(int argc, char** argv) {
   GREATEST_MAIN_BEGIN();
 
-  RUN_TEST(encode_test);
+  RUN_TEST(j8_encode_test);
   RUN_TEST(shell_encode_test);
   RUN_TEST(char_int_test);
   RUN_TEST(can_omit_quotes_test);

diff --git a/data_lang/j8_test_lib.c b/data_lang/j8_test_lib.c
@@ -7,5 +7,7 @@ const char* J8_TEST_CASES[] = {
     "foozz abcd \xfe \x1f",
     "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe",
     "\xff\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe",
+    "C:\\Program Files\\",
+    "Fool's Gold",  // single quote
     NULL,
 };