Skip to content

Commit

Permalink
[j8] Start implementing ShellEncodeString()
Browse files Browse the repository at this point in the history
Set up some unit tests.

Right now the inner loop calls J8EncodeOne().
  • Loading branch information
Andy C committed Jan 29, 2024
1 parent e4a1841 commit e2ce626
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 45 deletions.
38 changes: 29 additions & 9 deletions data_lang/j8.h
Expand Up @@ -147,6 +147,30 @@ static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
//
}

// Like the above, but
//
// escape_style == 0 - try shell 'foo'
// must be valid UTF-8
// no control chars
// no ' is required
// no \ -- not required, but avoids ambiguous '\n'
//
// For example we write $'\\' or b'\\' not '\'
// The latter should be written r'\', but we're not outputting

#define STYLE_SQ 0 // 'foo'
#define STYLE_DOLLAR_SQ 1 // $'\xff'
#define STYLE_B_STRING 2 // b'\yff'

// escape_style == 1 means $'\xff'
//
// escape_style == 2 means b'\yff' I think?

// Encode one input character for shell-style output.
//
// Work in progress: every style currently delegates to J8EncodeOne() with a
// third argument of 1.  escape_style (one of the STYLE_* constants) is
// accepted but not consulted yet.
//
// Args:
//   p_in          in/out cursor into the input, forwarded to J8EncodeOne()
//   p_out         in/out cursor into the output, forwarded to J8EncodeOne()
//   escape_style  STYLE_SQ, STYLE_DOLLAR_SQ or STYLE_B_STRING (unused for now)
//
// Returns:
//   The result of J8EncodeOne().  ShellEncodeChunk() treats a non-zero value
//   as "cannot encode in this style" and returns early, so the value must be
//   propagated; falling off the end of a non-void function whose result is
//   used is undefined behavior.
static inline int ShellEncodeOne(unsigned char** p_in, unsigned char** p_out,
                                 int escape_style) {
  (void)escape_style;  // not used until the per-style logic is written
  return J8EncodeOne(p_in, p_out, 1);
}

// Right now \u001f and \u{1f} are the longest output sequences for a byte.
// Bug fix: we need 6 + 1 for the NUL terminator that sprintf() writes! (Even
// though we don't technically need it)
Expand All @@ -166,17 +190,13 @@ static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
return 0;
}

// TODO: $'\x00\u1234' escaping
//
// Placeholder: neither p_in nor p_out is touched and the result is always 0,
// so a caller that checks for a non-zero "needs escaping" result never sees
// one.
inline int ShellEncodeOne(unsigned char** p_in, unsigned char** p_out) {
return 0;
}

inline int ShellEncodeChunk(unsigned char** p_in, unsigned char* in_end,
unsigned char** p_out, unsigned char* out_end) {
unsigned char** p_out, unsigned char* out_end,
int escape_style) {
while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
int dollar_fallback = ShellEncodeOne(p_in, p_out);
if (dollar_fallback) { // we need escaping, e.g. \u0001 or \'
return dollar_fallback; // early return
int cannot_encode = ShellEncodeOne(p_in, p_out, escape_style);
if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
return cannot_encode; // early return
}
}
return 0;
Expand Down
12 changes: 2 additions & 10 deletions data_lang/j8.py
Expand Up @@ -83,16 +83,8 @@ class Printer(object):

def __init__(self):
# type: () -> None
"""
Args:
# These can all be packed into the same byte. ASDL needs bit_set
# support I think?
options:
control j"" vs. ""
control escaping \\x and \\u escaping like QSN
pretty.UnquotedKeys - ASDL uses this?
"""
self.options = 0

# TODO: should remove this in favor of BufWriter method
self.spaces = {0: ''} # cache of strings with spaces

# Could be PrintMessage or PrintJsonMessage()
Expand Down
117 changes: 112 additions & 5 deletions data_lang/j8_libc.c
Expand Up @@ -44,6 +44,48 @@ void EncodeBString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
J8_OUT('\0'); // NUL terminate for printf
}

// $'' escaping
// This function is a COPY of EncodeBString() above
//
// Writes $'...' followed by a NUL terminator into out_buf.
//
// Args:
//   in_buf    bytes to encode (data, len)
//   out_buf   must already own a heap buffer of at least `capacity` bytes in
//             out_buf->data.  Output starts at out_buf->data, so any previous
//             contents are overwritten.  The buffer may be moved by realloc().
//   capacity  current size of out_buf->data in bytes; must leave room for the
//             two opening characters.
//
// On return out_buf->len is the encoded length, NOT counting the trailing NUL.
// If the buffer cannot be grown, it is freed and out_buf is set to
// {NULL, 0} instead of writing through a NULL pointer.
void EncodeBashDollarString(j8_buf_t in_buf, j8_buf_t* out_buf, int capacity) {
  // Compute pointers for the inner loop
  unsigned char* in = (unsigned char*)in_buf.data;
  unsigned char* in_end = in + in_buf.len;

  unsigned char* out = out_buf->data;  // mutated
  unsigned char* out_end = out_buf->data + capacity;
  // Always points at the local `out`, so it never needs to be re-assigned
  // after a realloc.
  unsigned char** p_out = &out;

  J8_OUT('$');  // Left quote $'
  J8_OUT('\'');

  while (true) {
    // Fill as much as we can
    ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_DOLLAR_SQ);
    out_buf->len = out - out_buf->data;  // recompute length

    if (in >= in_end) {
      break;
    }

    // Same growth policy as below
    capacity = capacity * 3 / 2;
    // Keep the old pointer until we know realloc() succeeded; otherwise the
    // original buffer would leak and the next write would hit NULL.
    unsigned char* grown = (unsigned char*)realloc(out_buf->data, capacity);
    if (grown == NULL) {
      free(out_buf->data);
      out_buf->data = NULL;
      out_buf->len = 0;
      return;
    }
    out_buf->data = grown;

    // Recompute pointers
    out = out_buf->data + out_buf->len;
    out_end = out_buf->data + capacity;
  }

  // The chunk loop only reserves space per input byte.  It does not reserve
  // the two bytes needed for the closing quote and the NUL, so make sure they
  // fit before writing them.
  if (out_end - out < 2) {
    capacity = out_buf->len + 2;
    unsigned char* grown = (unsigned char*)realloc(out_buf->data, capacity);
    if (grown == NULL) {
      free(out_buf->data);
      out_buf->data = NULL;
      out_buf->len = 0;
      return;
    }
    out_buf->data = grown;
    out = out_buf->data + out_buf->len;
    out_end = out_buf->data + capacity;
  }

  J8_OUT('\'');  // Right quote
  out_buf->len = out - out_buf->data;

  J8_OUT('\0');  // NUL terminate for printf
}

void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback) {
unsigned char* in = (unsigned char*)in_buf.data;
unsigned char* in_end = in + in_buf.len;
Expand Down Expand Up @@ -105,10 +147,75 @@ void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback) {
J8_OUT('\0'); // NUL terminate for printf
}

// $'' escaping, very similar to J8
//
// Placeholder: the body is empty, so in_buf is not read and out_buf is left
// untouched.
void BashDollarEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf) {
}

// Start with '', but fall back on $'' or b'' for ASCII control and \'
//
// Depending on escape_style, fall back to
//
//    EncodeBashDollarString()   -- $'\xff'   (escape_style == STYLE_DOLLAR_SQ)
//    EncodeBString()            -- b'\yff'   (any other value)
//
// Mostly a COPY of J8EncodeString() above
//
// Args:
//   in_buf        bytes to encode (data, len)
//   out_buf       receives a freshly malloc()'d, NUL-terminated result; the
//                 caller frees out_buf->data.  out_buf->len does NOT count
//                 the trailing NUL.
//   escape_style  which fallback to use when the plain '' form cannot
//                 represent the input
//
// If memory cannot be allocated, out_buf is set to {NULL, 0}.
void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style) {
  unsigned char* in = (unsigned char*)in_buf.data;
  unsigned char* in_end = in + in_buf.len;

  // Growth policy: Start at max(N + 3 + 2, 16)
  int capacity = in_buf.len + 3 + 2;  // 3 for quotes, 2 potential \" \n
  if (capacity < 16) {  // account for J8_MAX_BYTES_PER_INPUT_BYTE
    capacity = 16;
  }

  out_buf->data = (unsigned char*)malloc(capacity);
  out_buf->len = 0;  // starts out empty
  if (out_buf->data == NULL) {
    return;  // out of memory: report {NULL, 0} rather than write through NULL
  }

  unsigned char* out = out_buf->data;  // mutated
  unsigned char* out_end = out_buf->data + capacity;
  // Always points at the local `out`, so it never needs to be re-assigned
  // after a realloc.
  unsigned char** p_out = &out;

  // Shell single quote, per STYLE_SQ.  A double quote here would let the
  // shell expand $ and backticks inside the string.
  J8_OUT('\'');

  while (true) {
    // Fill in as much as we can
    int cannot_encode = ShellEncodeChunk(&in, in_end, &out, out_end, STYLE_SQ);
    if (cannot_encode) {
      out_buf->len = 0;  // rewind to beginning; the fallback re-encodes in_buf
      if (escape_style == STYLE_DOLLAR_SQ) {
        EncodeBashDollarString(in_buf, out_buf, capacity);  // fall back to $''
      } else {
        EncodeBString(in_buf, out_buf, capacity);  // fall back to b''
      }
      return;
    }
    out_buf->len = out - out_buf->data;  // recompute length

    if (in >= in_end) {
      break;
    }

    // Growth policy: every time through the loop, increase 1.5x
    //
    // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
    // This seems like a reasonable tradeoff between over-allocating and too
    // many realloc().
    capacity = capacity * 3 / 2;
    // Keep the old pointer until realloc() is known to have succeeded.
    unsigned char* grown = (unsigned char*)realloc(out_buf->data, capacity);
    if (grown == NULL) {
      free(out_buf->data);
      out_buf->data = NULL;
      out_buf->len = 0;
      return;
    }
    out_buf->data = grown;

    // Recompute pointers
    out = out_buf->data + out_buf->len;
    out_end = out_buf->data + capacity;
  }

  // The chunk loop only reserves space per input byte.  It does not reserve
  // the two bytes needed for the closing quote and the NUL, so make sure they
  // fit before writing them.
  if (out_end - out < 2) {
    capacity = out_buf->len + 2;
    unsigned char* grown = (unsigned char*)realloc(out_buf->data, capacity);
    if (grown == NULL) {
      free(out_buf->data);
      out_buf->data = NULL;
      out_buf->len = 0;
      return;
    }
    out_buf->data = grown;
    out = out_buf->data + out_buf->len;
    out_end = out_buf->data + capacity;
  }

  J8_OUT('\'');
  out_buf->len = out - out_buf->data;

  J8_OUT('\0');  // NUL terminate for printf
}
2 changes: 1 addition & 1 deletion data_lang/j8_libc.h
Expand Up @@ -23,6 +23,6 @@ typedef struct j8_buf_t {

void J8EncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int j8_fallback);

void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf);
void ShellEncodeString(j8_buf_t in_buf, j8_buf_t* out_buf, int escape_style);

#endif // DATA_LANG_J8_LIBC_H
44 changes: 44 additions & 0 deletions data_lang/j8_libc_test.c
Expand Up @@ -67,6 +67,49 @@ TEST encode_test() {
PASS();
}

// Runs ShellEncodeString() over every entry of J8_TEST_CASES, once with the
// $'' fallback and once with the b'' fallback, printing each result.
//
// Only the STYLE_DOLLAR_SQ result is length-checked; the STYLE_B_STRING result
// is printed for manual inspection.
TEST shell_encode_test() {
  for (int i = 0; J8_TEST_CASES[i]; ++i) {
    const char* s = J8_TEST_CASES[i];
    int input_len = strlen(s);
    j8_buf_t in = {(unsigned char*)s, input_len};

    j8_buf_t dollar_result = {0};
    ShellEncodeString(in, &dollar_result, STYLE_DOLLAR_SQ);

    printf("result %s\n", dollar_result.data);
    printf("result.len %d\n", dollar_result.len);

    // Some sanity checks
    switch (input_len) {
    case 0:  // empty string -> just the two quote characters
      ASSERT_EQ_FMT(2, dollar_result.len, "%d");
      break;
    case 1:  // x -> x surrounded by two quote characters
      ASSERT_EQ_FMT(3, dollar_result.len, "%d");
      break;
    default:
      ASSERT(input_len < dollar_result.len);
      break;
    }
    free(dollar_result.data);

    // Encode again with J8 fallback.
    //
    // A separate zero-initialized variable is used because `result = {0};`
    // is not valid C: a braced list may only appear in an initializer.
    j8_buf_t b_result = {0};
    ShellEncodeString(in, &b_result, STYLE_B_STRING);

    printf("result %s\n", b_result.data);
    printf("result.len %d\n", b_result.len);
    free(b_result.data);

    printf("\n");
  }

  PASS();
}

TEST can_omit_quotes_test() {
const char* s = "foo";
ASSERT(CanOmitQuotes((unsigned char*)s, strlen(s)));
Expand All @@ -86,6 +129,7 @@ int main(int argc, char** argv) {
GREATEST_MAIN_BEGIN();

RUN_TEST(encode_test);
RUN_TEST(shell_encode_test);
RUN_TEST(char_int_test);
RUN_TEST(can_omit_quotes_test);

Expand Down
40 changes: 20 additions & 20 deletions data_lang/j8_test.cc
Expand Up @@ -140,7 +140,7 @@ void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
// printf("RETRY\n");
result->erase(begin_index, std::string::npos);
EncodeBString(s, n, result); // fall back to b''
printf("\t[1] result len %d\n", result->size());
printf("\t[1] result len %d\n", static_cast<int>(result->size()));
return;
}

Expand All @@ -152,7 +152,7 @@ void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
result->erase(end_index, std::string::npos);
}
result->append("\"");
printf("\t[1] result len %d\n", result->size());
printf("\t[1] result len %d\n", static_cast<int>(result->size()));
}

void EncodeAndPrint(char* s, int n, int j8_fallback) {
Expand All @@ -177,33 +177,33 @@ void EncodeAndPrint(char* s, int n, int j8_fallback) {

// Prints the encoding of several representative inputs, each one twice: with
// j8_fallback == 0 and with j8_fallback == 1.
//
// The string literals are held as const char* and const_cast at the call
// site, because EncodeAndPrint() takes a non-const char*.
TEST encode_test() {
#if 1
  const char* mixed = "hi \x01 \u4000\xfe\u4001\xff\xfd ' \" new \n \\ \u03bc";
  EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 0);
  EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 1);
#endif

  const char* a = "ab";
  EncodeAndPrint(const_cast<char*>(a), strlen(a), 0);
  EncodeAndPrint(const_cast<char*>(a), strlen(a), 1);

  const char* b = "0123456789";
  EncodeAndPrint(const_cast<char*>(b), strlen(b), 0);
  EncodeAndPrint(const_cast<char*>(b), strlen(b), 1);

  // Pass `u` itself.  Passing `b` with strlen(u) would re-test a prefix of
  // the digits string and never exercise this input.
  const char* u = "hi \u4000 \u03bc";
  EncodeAndPrint(const_cast<char*>(u), strlen(u), 0);
  EncodeAndPrint(const_cast<char*>(u), strlen(u), 1);

  // Internal NUL: the length is given explicitly because strlen() would
  // stop at the first byte.
  const char* bin = "\x00\x01\xff";
  EncodeAndPrint(const_cast<char*>(bin), 3, 0);
  EncodeAndPrint(const_cast<char*>(bin), 3, 1);

  // Blow up size
  const char* blowup =
      "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe";
  EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 0);
  EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 1);

  PASS();
}
Expand Down

0 comments on commit e2ce626

Please sign in to comment.