Skip to content

Commit

Permalink
[unicode] Consolidate surrogate range and max code point checks
Browse files: browse the repository at this point in the history
  • Loading branch information
Andy Chu committed May 22, 2024
1 parent 4aab5cb commit f0157e0
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 12 deletions.
14 changes: 10 additions & 4 deletions osh/word_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,23 @@ def EvalSingleQuoted(id_, tokens):
# EvalCStringToken() redoes some of this work, but right now it's
# shared with dynamic echo -e / printf, which don't have tokens.

if t.id == Id.Char_Unicode8: # check for invalid \U00110000
i = -1
if t.id in (Id.Char_Unicode4, Id.Char_Unicode8):
s = lexer.TokenSliceLeft(t, 2)
i = int(s, 16)
if i > 0x10ffff:
p_die("Code point can't be greater than U+10ffff", t)

elif t.id == Id.Char_UBraced: # check for invalid \u{110000}
elif t.id == Id.Char_UBraced:
s = lexer.TokenSlice(t, 3, -1)
i = int(s, 16)

if i != -1:
# check for invalid \U00110000 or \u{110000}
if i > 0x10ffff:
p_die("Code point can't be greater than U+10ffff", t)
if 0xD800 <= i and i < 0xE000:
p_die(
r"%s escape is illegal because it's in the surrogate range"
% lexer.TokenVal(t), t)

strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

Expand Down
8 changes: 0 additions & 8 deletions osh/word_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,14 +736,6 @@ def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
p_die(
r"%s escapes not allowed in u'' strings" %
lexer.TokenVal(tok), tok)
# \u{dc00} isn't valid
if tok.id == Id.Char_UBraced:
h = lexer.TokenSlice(tok, 3, -1) # \u{123456}
i = int(h, 16)
if 0xD800 <= i and i < 0xE000:
p_die(
r"%s escape is illegal because it's in the surrogate range"
% lexer.TokenVal(tok), tok)

out_tokens.extend(tokens)
return self.cur_token
Expand Down
36 changes: 36 additions & 0 deletions spec/unicode.test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,42 @@ py-repr $'\U00110000'
'\xf4\x90\x80\x80'
## END

#### $'' checks for surrogate range at parse time

# mksh is skipped: it exits here before printing anything, which is why its
# expected stdout further down is empty.
case $SH in mksh) exit ;; esac

# U+DC00 lies in the surrogate range (0xD800 <= cp < 0xE000). The quoted
# 'EOF' delimiter hands the body to $SH literally, so the \u escape is seen
# only by the shell under test. A non-zero exit status from $SH means it
# rejected the escape, which is the expected result.
$SH << 'EOF'
x=$'\udc00'
EOF
if test $? -ne 0; then
echo pass
else
echo fail
fi

# Same code point written with the 8-digit \U form; it should be rejected
# in the same way.
$SH << 'EOF'
x=$'\U0000dc00'
EOF
if test $? -ne 0; then
echo pass
else
echo fail
fi


## STDOUT:
pass
pass
## END

## BUG bash STDOUT:
fail
fail
## END

## BUG mksh STDOUT:
## END


#### printf / echo -e check that 0x110000 is too big at runtime
case $SH in mksh) exit ;; esac
Expand Down

0 comments on commit f0157e0

Please sign in to comment.