Skip to content

Commit

Permalink
[j8] Implement low-level shell encoding
Browse files Browse the repository at this point in the history
With tests in j8_libc_test.c

Start with Bourne shell '' strings, then fall back on bash $'' strings.

TODO:

- Wrap this in Python
- Wrap this in C++
- Add wrapper with CanOmitQuotes()
  • Loading branch information
Andy C committed Jan 29, 2024
1 parent e2ce626 commit 23756b8
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 22 deletions.
165 changes: 149 additions & 16 deletions data_lang/j8.h
Expand Up @@ -141,15 +141,111 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
break;
}
}
// Unreachable
}

// Encode one unit of input at *p_in for the inside of a bash $'' string,
// writing to *p_out and advancing both pointers.
//
// Like the above (J8EncodeOne), but
//
//   \xff instead of \yff
//   \u001f always, never \u{1f}
//   No JSON vs. J8
//   No \" escape ever
//   No errors -- it can encode everything
//
// Every call consumes at least one input byte, and emits at most 6 output
// bytes per input byte consumed (the \u001f form).  No NUL terminator is
// written.
//
// NOTE(review): there is no in_end parameter, so after a UTF-8 lead byte the
// following byte is read unconditionally.  Presumably callers guarantee a
// readable byte (e.g. a NUL) after the end of the input -- confirm.

static inline void BashDollarEncodeOne(unsigned char** p_in,
                                       unsigned char** p_out) {
  // Lowercase digits, matching what printf's %x produces.
  const char* hex_digits = "0123456789abcdef";
  unsigned char ch = **p_in;

  //
  // Handle \\ \b \f \n \r \t \'
  //
  unsigned char escape_letter = 0;
  switch (ch) {
  case '\\':
    escape_letter = '\\';
    break;
  case '\b':
    escape_letter = 'b';
    break;
  case '\f':
    escape_letter = 'f';
    break;
  case '\n':
    escape_letter = 'n';
    break;
  case '\r':
    escape_letter = 'r';
    break;
  case '\t':
    escape_letter = 't';
    break;
  case '\'':
    escape_letter = '\'';
    break;
  default:
    break;
  }
  if (escape_letter != 0) {
    J8_OUT('\\');
    J8_OUT(escape_letter);
    (*p_in)++;
    return;
  }

  //
  // Unprintable ASCII control codes: always the 6-byte form \u00NN
  //
  if (ch < 0x20) {
    J8_OUT('\\');
    J8_OUT('u');
    J8_OUT('0');
    J8_OUT('0');
    J8_OUT(hex_digits[ch >> 4]);
    J8_OUT(hex_digits[ch & 0x0f]);
    (*p_in)++;
    return;
  }

  //
  // UTF-8 encoded runes and invalid bytes
  //
  unsigned char* start = *p_in;  // save start position
  uint32_t codepoint = 0;
  uint32_t state = UTF8_ACCEPT;

  while (1) {
    decode(&state, &codepoint, ch);
    switch (state) {
    case UTF8_REJECT: {
      // Escape only the FIRST byte of the bad sequence, zero-padded to two
      // hex digits, and resume right after it.  Any bytes that followed the
      // lead byte are re-examined by the next call, so nothing is dropped
      // when a sequence is rejected part way through.
      unsigned char bad = *start;
      J8_OUT('\\');
      J8_OUT('x');
      J8_OUT(hex_digits[bad >> 4]);
      J8_OUT(hex_digits[bad & 0x0f]);
      *p_in = start + 1;
      return;
    }
    case UTF8_ACCEPT: {
      (*p_in)++;
      // Copy the whole valid sequence [start, *p_in) through unchanged.
      while (start < *p_in) {
        J8_OUT(*start);
        start++;
      }
      return;
    }
    default:
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
      ch = **p_in;
      break;
    }
  }
  // Unreachable
}

// Like the above, but
// BourneShellEncodeOne rules:
//
// escape_style == 0 - try shell 'foo'
// must be valid UTF-8
// no control chars
// no ' is required
Expand All @@ -158,17 +254,45 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
// For example we write $'\\' or b'\\' not '\'
// The latter should be written r'\', but we're not outputting

#define STYLE_SQ 0 // 'foo'
#define STYLE_DOLLAR_SQ 1 // $'\xff'
#define STYLE_B_STRING 2 // b'\yff'
// Try to copy one unit of input at *p_in verbatim, for the inside of a
// Bourne shell '' string.
//
// Returns 0 after copying one ASCII byte or one complete UTF-8 sequence to
// *p_out; both *p_in and *p_out are advanced past it.
//
// Returns 1 when the input can't be written in this style: a single quote, a
// backslash, an ASCII control code below 0x20, or invalid UTF-8.  In that
// case nothing is written and *p_in is left where it was on entry.
static inline int BourneShellEncodeOne(unsigned char** p_in,
                                       unsigned char** p_out) {
  unsigned char ch = **p_in;

  if (ch == '\'' || ch == '\\') {  // can't encode these in Bourne shell ''
    return 1;
  }
  if (ch < 0x20) {  // Unprintable ASCII control codes
    return 1;
  }

  // UTF-8 encoded runes and invalid bytes
  unsigned char* start = *p_in;  // save start position
  uint32_t codepoint = 0;
  uint32_t state = UTF8_ACCEPT;

  while (1) {
    decode(&state, &codepoint, ch);
    switch (state) {
    case UTF8_REJECT: {
      // Lead bytes may already have been stepped over; undo that so a failed
      // call has no effect on the input position.
      *p_in = start;
      return 1;
    }
    case UTF8_ACCEPT: {
      (*p_in)++;
      // Copy the whole valid sequence [start, *p_in) through unchanged.
      while (start < *p_in) {
        J8_OUT(*start);
        start++;
      }
      return 0;
    }
    default:
      (*p_in)++;  // advance, next UTF8_ACCEPT will write it
      ch = **p_in;
      break;
    }
  }
  // Unreachable
}

// Right now \u001f and \u{1f} are the longest output sequences for a byte.
Expand All @@ -190,11 +314,20 @@ static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
return 0;
}

inline int ShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
unsigned char** p_out, unsigned char* out_end,
int escape_style) {
// Encode input from *p_in up to in_end in bash $'' style, for as long as
// J8_MAX_BYTES_PER_INPUT_BYTE bytes of room remain before out_end.
//
// Advances *p_in and *p_out.  The loop can stop before in_end when the output
// runs out of room, so callers compare *p_in with in_end to tell whether
// another pass is needed.  Always returns 0.
//
// Declared static inline: a plain `inline` definition in a C header provides
// no external definition, so a call that isn't inlined could fail to link.
static inline int BashDollarEncodeChunk(unsigned char** p_in,
                                        unsigned char* in_end,
                                        unsigned char** p_out,
                                        unsigned char* out_end) {
  while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
    BashDollarEncodeOne(p_in, p_out);
  }
  return 0;
}

inline int BourneShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
unsigned char** p_out,
unsigned char* out_end) {
while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
int cannot_encode = ShellEncodeOne(p_in, p_out, escape_style);
int cannot_encode = BourneShellEncodeOne(p_in, p_out);
if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
return cannot_encode; // early return
}
Expand Down
8 changes: 4 additions & 4 deletions data_lang/j8_libc.c
Expand Up @@ -62,7 +62,7 @@ void EncodeBashDollarString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
// printf("B iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
// out, out_end);
// Fill as much as we can
ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_DOLLAR_SQ);
BashDollarEncodeChunk(&in, in_end, &out, out_end);
out_buf->len = out - out_buf->data; // recompute length

if (in >= in_end) {
Expand Down Expand Up @@ -173,13 +173,13 @@ void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
unsigned char* out_end = out_buf->data + capacity;
unsigned char** p_out = &out;

J8_OUT('"');
J8_OUT('\'');

while (true) {
// Fill in as much as we can
// printf("J8 iter %p < %p and %p < %p < %p\n", in, in_end, out_buf->data,
// out, out_end);
int cannot_encode = ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_SQ);
int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
if (cannot_encode) {
out_buf->len = 0; // rewind to begining
// printf("out %p out_end %p capacity %d\n", out, out_end, capacity);
Expand Down Expand Up @@ -214,7 +214,7 @@ void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
// printf("[1] out %p out_end %p\n", out, out_end);
}

J8_OUT('"');
J8_OUT('\'');
out_buf->len = out - out_buf->data;

J8_OUT('\0'); // NUL terminate for printf
Expand Down
3 changes: 3 additions & 0 deletions data_lang/j8_libc.h
Expand Up @@ -23,6 +23,9 @@ typedef struct j8_buf_t {

void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback);

#define STYLE_DOLLAR_SQ 1 // $'\xff'
#define STYLE_B_STRING 2 // b'\yff'

void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style);

#endif // DATA_LANG_J8_LIBC_H
4 changes: 2 additions & 2 deletions data_lang/j8_libc_test.c
Expand Up @@ -24,7 +24,7 @@ TEST char_int_test() {
PASS();
}

TEST encode_test() {
TEST j8_encode_test() {
for (int i = 0; J8_TEST_CASES[i]; ++i) {
const char* s = J8_TEST_CASES[i];
int input_len = strlen(s);
Expand Down Expand Up @@ -128,7 +128,7 @@ GREATEST_MAIN_DEFS();
int main(int argc, char** argv) {
GREATEST_MAIN_BEGIN();

RUN_TEST(encode_test);
RUN_TEST(j8_encode_test);
RUN_TEST(shell_encode_test);
RUN_TEST(char_int_test);
RUN_TEST(can_omit_quotes_test);
Expand Down
2 changes: 2 additions & 0 deletions data_lang/j8_test_lib.c
Expand Up @@ -7,5 +7,7 @@ const char* J8_TEST_CASES[] = {
"foozz abcd \xfe \x1f",
"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe",
"\xff\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe",
"C:\\Program Files\\",
"Fool's Gold", // single quote
NULL,
};

0 comments on commit 23756b8

Please sign in to comment.