Skip to content

Commit

Permalink
[j8] declare -p uses new fastfunc.ShellEncodeString()
Browse files Browse the repository at this point in the history
Wrote C++ functions that write to BufWriter().  Though we don't need
BufWriter right now, it's easy to follow the style of J8EncodeString().

TODO:

- these functions can be moved to j8_lite.py
- use j8_lite.py everywhere, and then QSN can be deleted.
  - qsn.maybe_shell_encode() -> j8_lite.MaybeShellEncode()
  • Loading branch information
Andy C committed Jan 29, 2024
1 parent 80c82ca commit dc5bb5d
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 53 deletions.
162 changes: 135 additions & 27 deletions cpp/data_lang.cc
Expand Up @@ -8,52 +8,52 @@
// TODO: remove duplication
#define LOSSY_JSON (1 << 3)

namespace fastfunc {

bool CanOmitQuotes(BigStr* s) {
return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
}
namespace {

BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
auto buf = Alloc<mylib::BufWriter>();
int options = j8_fallback ? 0 : LOSSY_JSON;
pyj8::WriteString(s, options, buf);
return buf->getvalue();
}
void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));

} // namespace fastfunc
buf->WriteConst("b'");

namespace pyj8 {
// Set up pointers after writing opening quote
uint8_t* out = buf->LengthPointer(); // mutated
uint8_t* out_end = buf->CapacityPointer();

bool PartIsUtf8(BigStr* s, int start, int end) {
uint32_t codepoint;
uint32_t state = UTF8_ACCEPT;
while (true) {
J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
buf->SetLengthFrom(out);

for (int i = start; i < end; ++i) {
// This var or a static_cast<> is necessary. Should really change BigStr*
// to use unsigned type
uint8_t c = s->data_[i];
decode(&state, &codepoint, c);
if (state == UTF8_REJECT) {
return false;
if (in >= in_end) {
break;
}

// Same growth policy as below
capacity = capacity * 3 / 2;
// printf("[2] new capacity %d\n", capacity);
buf->EnsureMoreSpace(capacity);

// Recompute pointers
out = buf->LengthPointer();
out_end = buf->CapacityPointer();
}

return state == UTF8_ACCEPT;
buf->WriteConst("'");
}

void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));

buf->WriteConst("b'");
buf->WriteConst("$'");

// Set up pointers after writing opening quote
uint8_t* out = buf->LengthPointer(); // mutated
uint8_t* out_end = buf->CapacityPointer();

while (true) {
J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
BashDollarEncodeChunk(&in, in_end, &out,
out_end); // Fill as much as we can
buf->SetLengthFrom(out);

if (in >= in_end) {
Expand All @@ -73,6 +73,114 @@ void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
buf->WriteConst("'");
}

// Appends a shell-quoted encoding of s to buf.
//
// Style is COPIED from pyj8::WriteString()
// Functionality is like j8_libc.c ShellEncodeString, that is:
//
// call BourneShellEncodeChunk()
// then either
// WriteBString()
// WriteBashDollarString()
//
// The string is first written between single quotes.  If
// BourneShellEncodeChunk() reports that it cannot encode the input, the
// partial output is truncated away and the whole string is written again with
// WriteBString() (b'' style) when ysh_fallback is nonzero, or with
// WriteBashDollarString() ($'' style) otherwise.

void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));

// Growth policy: Start at a fixed size max(N + 3 + 2, 16)
int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
if (capacity < 16) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
capacity = 16;
}
// printf("[1] capacity %d\n", capacity);

buf->EnsureMoreSpace(capacity);

// Remember where this string starts, so the fallback path can discard the
// opening quote and anything encoded so far.
int begin = buf->Length(); // maybe Truncate to this position
buf->WriteConst("'");

// Set up pointers after writing opening quote
uint8_t* out = buf->LengthPointer(); // mutated
uint8_t* out_end = buf->CapacityPointer();

while (true) {
// Fill in as much as we can
int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
if (cannot_encode) {
// Start over in a different quoting style.  The fallback writers read s
// from its beginning again, and receive the current capacity value.
buf->Truncate(begin);
if (ysh_fallback) {
WriteBString(s, buf, capacity); // fall back to b''
} else {
WriteBashDollarString(s, buf, capacity); // fall back to $''
}
return;
}
// Commit the bytes written through the raw 'out' pointer to the buffer
buf->SetLengthFrom(out);

// printf("[1] len %d\n", out_buf->len);

// All input consumed: only the closing quote is left to write
if (in >= in_end) {
break;
}

// Growth policy: every time through the loop, increase 1.5x
//
// The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
// This seems like a reasonable tradeoff between over-allocating and too
// many realloc().
capacity = capacity * 3 / 2;
// printf("[1] new capacity %d\n", capacity);
buf->EnsureMoreSpace(capacity);

// Recompute pointers
out = buf->LengthPointer(); // mutated
out_end = buf->CapacityPointer();
// printf("[1] out %p out_end %p\n", out, out_end);
}

buf->WriteConst("'");
}

} // namespace

namespace fastfunc {

// Thin wrapper that forwards the bytes of s, and their count, to the global
// ::CanOmitQuotes() and returns its result.
bool CanOmitQuotes(BigStr* s) {
  // View the string data through an unsigned char pointer for the callee.
  unsigned char* bytes = reinterpret_cast<unsigned char*>(s->data_);
  return ::CanOmitQuotes(bytes, len(s));
}

// Encodes s with pyj8::WriteString() into a fresh BufWriter and returns the
// accumulated value.
//
// A nonzero j8_fallback passes no option flags; zero passes LOSSY_JSON.
BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
  int options = LOSSY_JSON;
  if (j8_fallback) {
    options = 0;
  }

  auto writer = Alloc<mylib::BufWriter>();
  pyj8::WriteString(s, options, writer);
  return writer->getvalue();
}

// Shell-encodes s into a fresh BufWriter via the file-level
// ::ShellEncodeString() and returns the accumulated value.
//
// ysh_fallback is passed through unchanged.
BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
  auto writer = Alloc<mylib::BufWriter>();
  ::ShellEncodeString(s, ysh_fallback, writer);

  BigStr* encoded = writer->getvalue();
  return encoded;
}

} // namespace fastfunc

namespace pyj8 {

// Runs the bytes s->data_[start .. end) through decode() and reports whether
// the decoder finished in UTF8_ACCEPT without ever reaching UTF8_REJECT.
//
// An empty range (start >= end) returns true, since the state starts out as
// UTF8_ACCEPT.
bool PartIsUtf8(BigStr* s, int start, int end) {
  uint32_t dfa_state = UTF8_ACCEPT;
  uint32_t codepoint_out;  // written by decode(); not read here

  int pos = start;
  while (pos < end) {
    // The byte has to be made unsigned before it is handed to decode().
    // Should really change BigStr* to use unsigned type
    decode(&dfa_state, &codepoint_out, static_cast<uint8_t>(s->data_[pos]));
    if (dfa_state == UTF8_REJECT) {
      return false;  // no need to look at the remaining bytes
    }
    ++pos;
  }

  // Any state other than UTF8_ACCEPT after the last byte is a failure
  return dfa_state == UTF8_ACCEPT;
}

void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
bool j8_fallback = !(options & LOSSY_JSON);

Expand Down
2 changes: 2 additions & 0 deletions cpp/data_lang.h
Expand Up @@ -10,6 +10,8 @@ bool CanOmitQuotes(BigStr* s);

BigStr* J8EncodeString(BigStr* s, int j8_fallback);

BigStr* ShellEncodeString(BigStr* s, int ysh_fallback);

} // namespace fastfunc

namespace pyj8 {
Expand Down
40 changes: 16 additions & 24 deletions data_lang/j8.py
Expand Up @@ -4,14 +4,6 @@
TODO:
- Translate the whole thing to C++
- use Bjoern DFA for UTF-8 validation in printing and parsing
- move more of LexerDecoder out of pyj8.py? I think it can translate
- Remove most of QSN
- QSN maybe_shell_encode() is used for bash features
- Remove shell_compat which does \\x00 instead of \\0
- Many more tests
- Run JSONTestSuite
Expand All @@ -22,12 +14,7 @@
- line wrapping -- do this later
- would like CONTRIBUTORS here
- Harmonize the API in data_lang/qsn.py
- use mylib.BufWriter output
- use u_style.LiteralUtf8 instead of BIT8_UTF8, etc.
- Unify with ASDL pretty printing?
{} [] are for JSON?
() is for statically typed ASDL data?
(command.Simple blame_tok:(...) words:[ ])
Expand All @@ -42,12 +29,13 @@
from core import error
from core import vm
from data_lang import pyj8
from data_lang import qsn
from frontend import consts
from frontend import match
from mycpp import mylib
from mycpp.mylib import tagswitch, iteritems, NewDict, log

import fastfunc

_ = log

from typing import cast, Dict, List, Tuple, Optional
Expand All @@ -62,23 +50,27 @@

def MaybeShellEncode(s):
# type: (str) -> str
return qsn.maybe_shell_encode(s)
"""
This is like ShellEncode(s, unquoted_ok=True)
But it's common, so we give it a shorter name.
"""
if fastfunc.CanOmitQuotes(s):
return s

return fastfunc.ShellEncodeString(s, 0) # no ysh_fallback


def ShellEncode(s):
# type: (str) -> str
return fastfunc.ShellEncodeString(s, 0) # no ysh_fallback

# TODO: call fastfunc.ShellEncodeString()
return qsn.maybe_shell_encode(s, flags=qsn.MUST_QUOTE)

def YshEncode(s, unquoted_ok=False):
# type: (str, bool) -> str
if unquoted_ok and fastfunc.CanOmitQuotes(s):
return s

def YshEncode(s):
# type: (str) -> str

# TODO: call fastfunc.ShellEncodeString(STYLE_B_STRING)
#
# ysh_fallback -- b'' style
return qsn.maybe_shell_encode(s, flags=qsn.MUST_QUOTE)
return fastfunc.ShellEncodeString(s, 1) # ysh_fallback


class Printer(object):
Expand Down
1 change: 0 additions & 1 deletion data_lang/j8_lite.py
Expand Up @@ -15,7 +15,6 @@ def EncodeString(s, unquoted_ok=False):
that method, then call BufWriter.clear() in between.
"""
if unquoted_ok and fastfunc.CanOmitQuotes(s):
#if unquoted_ok and fastlex.CanOmitQuotes(s):
return s

return fastfunc.J8EncodeString(s, 1) # j8_fallback is true
47 changes: 46 additions & 1 deletion spec/assign-extended.test.sh
Expand Up @@ -220,6 +220,51 @@ typeset -x test_var3=333
typeset test_var5=555
## END

#### declare -p doesn't print binary data, but can be loaded into bash

# bash prints binary data!
# So this case exits early under bash and mksh; their expected output is the
# empty N-I block at the end.
case $SH in bash|mksh) exit ;; esac

# One value per quoting style that appears in the expected output: unquoted,
# single-quoted, and two values printed with $'' escapes.
unquoted='foo'
sq='foo bar'
bash1=$'\x1f' # ASCII control char
bash2=$'\xfe\xff' # Invalid UTF-8

# The same values as plain string variables ...
s1=$unquoted
s2=$sq
s3=$bash1
s4=$bash2

# ... as indexed array elements, and as associative array keys and values
declare -a a=("$unquoted" "$sq" "$bash1" "$bash2")
declare -A A=(["$unquoted"]="$sq" ["$bash1"]="$bash2")

#echo lengths ${#s1} ${#s2} ${#s3} ${#s4} ${#a[@]} ${#A[@]}

# Print the declarations, keeping a copy in tmp.bash for the round trip below
declare -p s1 s2 s3 s4 a A | tee tmp.bash

echo ---

# Round trip: bash sources the saved declarations, then od dumps the bytes of
# s3 and s4 in hex so they can be compared with the original values
bash -c 'source tmp.bash; echo "$s1 $s2"; echo -n "$s3" "$s4" | od -A n -t x1'
echo bash=$?

## STDOUT:
declare -- s1=foo
declare -- s2='foo bar'
declare -- s3=$'\u001f'
declare -- s4=$'\xfe\xff'
declare -a a=(foo 'foo bar' $'\u001f' $'\xfe\xff')
declare -A A=([$'\u001f']=$'\xfe\xff' ['foo']='foo bar')
---
foo foo bar
1f 20 fe ff
bash=0
## END

## N-I bash/mksh STDOUT:
## END



#### declare -p var
# BUG? bash doesn't output anything for 'local/readonly -p var', which seems to
# contradict with manual. Besides, 'export -p var' is not described in
Expand Down Expand Up @@ -388,7 +433,7 @@ declare -A test_var7=()

#### declare -pnrx var
# Note: Bash ignores other flags (-nrx) when variable names are supplied while
# Oil uses other flags to select variables. Bash's behavior is documented.
# OSH uses other flags to select variables. Bash's behavior is documented.
test_var1=111
readonly test_var2=222
export test_var3=333
Expand Down

0 comments on commit dc5bb5d

Please sign in to comment.