Skip to content
Permalink
Browse files

syntax: refactor how escaped newlines are handled

We used to treat escaped newlines like any other escaped character in
the lexer. The parser would then deal with them as needed.

This produced inconsistent syntax trees. For example, an escaped newline
within double quotes should be skipped as per the POSIX Shell spec, but
we would include it as part of a literal. Then, the expand package would
be in charge of getting rid of it.

Instead, treat escaped newlines specially in the lexer, conforming with
the spec. Most code gets simpler, particularly in the parser, as it
doesn't have to deal with skipping and discarding bytes any more. Two
cases want to keep escaped newlines (single quotes and comments), but
that's a simple append.

Double-quoted strings now represent escaped newlines by separating the
parts onto different lines, even if the two parts are just *Lit. This is
enough information for the printer to do the right thing.

Besides the simplification in the lexer and in the code, the syntax tree
is now closer to what people would expect when wanting to expand or
interpret it.

Finally, this requires adding another special rune to the lexer. We were
already using RuneSelf as an EOF rune, so RuneSelf+1 now represents an
escaped newline.

Fixes #321.
  • Loading branch information...
mvdan committed Jun 2, 2019
1 parent a7c4d6e commit 9162b76c1a0d8a750dee37a77d79f803bc1398a2
Showing with 44 additions and 52 deletions.
  1. +0 −3 expand/expand.go
  2. +1 −4 syntax/filetests_test.go
  3. +26 −31 syntax/lexer.go
  4. +6 −10 syntax/parser.go
  5. +8 −2 syntax/printer.go
  6. +3 −2 syntax/printer_test.go
@@ -426,9 +426,6 @@ func (cfg *Config) wordField(wps []syntax.WordPart, ql quoteLevel) ([]fieldPart,
b := s[i]
if b == '\\' && i+1 < len(s) {
switch s[i+1] {
case '\n': // remove \\\n
i++
continue
case '"', '\\', '$', '`': // special chars
continue
}
@@ -4244,10 +4244,7 @@ func clearPosRecurse(tb testing.TB, src string, v interface{}) {
gotErr = got
}
got = strings.Replace(got, "\\\n", "", -1)
if len(got) > len(want) {
got = got[:len(want)]
}
if got == want {
if strings.HasPrefix(got, want) {
return
}
}
@@ -55,8 +55,10 @@ func bquoteEscaped(b byte) bool {
return false
}

const escNewl rune = utf8.RuneSelf + 1

func (p *Parser) rune() rune {
if p.r == '\n' {
if p.r == '\n' || p.r == escNewl {
// p.r instead of b so that newline
// character positions don't have col 0.
p.npos.line++
@@ -68,11 +70,14 @@ retry:
if p.bsp < len(p.bs) {
if b := p.bs[p.bsp]; b < utf8.RuneSelf {
p.bsp++
if b == '\\' && p.openBquotes > 0 {
// don't do it for newlines, as we want
// the newlines to be eaten in p.next
if bquotes < p.openBquotes && p.bsp < len(p.bs) &&
bquoteEscaped(p.bs[p.bsp]) {
if b == '\\' {
if p.r != '\\' && p.peekByte('\n') {
p.bsp++
p.w, p.r = 1, escNewl
return escNewl
}
if p.openBquotes > 0 && bquotes < p.openBquotes &&
p.bsp < len(p.bs) && bquoteEscaped(p.bs[p.bsp]) {
bquotes++
goto retry
}
@@ -194,6 +199,9 @@ func (p *Parser) next() {
p.tok = _EOF
return
}
for p.r == escNewl {
p.rune()
}
p.spaced = false
if p.quote&allKeepSpaces != 0 {
p.nextKeepSpaces()
@@ -206,6 +214,8 @@ skipSpace:
case utf8.RuneSelf:
p.tok = _EOF
return
case escNewl:
r = p.rune()
case ' ', '\t', '\r':
p.spaced = true
r = p.rune()
@@ -221,12 +231,6 @@ skipSpace:
p.doHeredocs()
}
return
case '\\':
if !p.peekByte('\n') {
break skipSpace
}
p.rune()
r = p.rune()
default:
break skipSpace
}
@@ -249,7 +253,10 @@ skipSpace:
case '#':
r = p.rune()
p.newLit(r)
for r != utf8.RuneSelf && r != '\n' {
for r != '\n' && r != utf8.RuneSelf {
if r == escNewl {
p.litBs = append(p.litBs, '\\', '\n')
}
r = p.rune()
}
if p.keepComments {
@@ -741,7 +748,7 @@ func (p *Parser) newLit(r rune) {
case r < utf8.RuneSelf:
p.litBs = p.litBuf[:1]
p.litBs[0] = byte(r)
case r > utf8.RuneSelf:
case r > escNewl:
w := utf8.RuneLen(r)
p.litBs = append(p.litBuf[:0], p.bs[p.bsp-w:p.bsp]...)
default:
@@ -751,10 +758,8 @@ func (p *Parser) newLit(r rune) {
}
}

// discardLit drops the last n bytes from the literal buffer p.litBs by
// reslicing it. No bounds check is performed here, so n must not exceed
// len(p.litBs) or the slice expression will panic.
func (p *Parser) discardLit(n int) { p.litBs = p.litBs[:len(p.litBs)-n] }

func (p *Parser) endLit() (s string) {
if p.r == utf8.RuneSelf {
if p.r == utf8.RuneSelf || p.r == escNewl {
s = string(p.litBs)
} else {
s = string(p.litBs[:len(p.litBs)-int(p.w)])
@@ -783,17 +788,11 @@ func (p *Parser) advanceNameCont(r rune) {
loop:
for p.newLit(r); r != utf8.RuneSelf; r = p.rune() {
switch {
case r == '\\':
if p.peekByte('\n') {
p.rune()
p.discardLit(2)
} else {
break loop
}
case 'a' <= r && r <= 'z':
case 'A' <= r && r <= 'Z':
case r == '_':
case '0' <= r && r <= '9':
case r == escNewl:
default:
break loop
}
@@ -807,9 +806,7 @@ loop:
for p.newLit(r); r != utf8.RuneSelf; r = p.rune() {
switch r {
case '\\': // escaped byte follows
if r = p.rune(); r == '\n' {
p.discardLit(2)
}
p.rune()
case '"', '`', '$':
tok = _Lit
break loop
@@ -852,9 +849,7 @@ loop:
case ' ', '\t', '\n', '\r', '&', '|', ';', '(', ')':
break loop
case '\\': // escaped byte follows
if r = p.rune(); r == '\n' {
p.discardLit(2)
}
p.rune()
case '>', '<':
if p.peekByte('(') {
tok = _Lit
@@ -898,7 +893,7 @@ loop:
break loop
case '\\': // escaped byte follows
p.rune()
case '`', '$':
case escNewl, '`', '$':
tok = _Lit
break loop
}
@@ -126,7 +126,7 @@ func (w *wrappedReader) Read(p []byte) (n int, err error) {
// If we lexed a newline for the first time, we just finished a line, so
// we may need to give a callback for the edge cases below not covered
// by Parser.Stmts.
if w.r == '\n' && w.npos.line > w.lastLine {
if (w.r == '\n' || w.r == escNewl) && w.npos.line > w.lastLine {
if w.Incomplete() {
// Incomplete statement; call back to print "> ".
if !w.fn(w.accumulated) {
@@ -967,15 +967,9 @@ func (p *Parser) wordPart() WordPart {
pe.Param = p.getLit()
if pe.Param != nil && pe.Param.Value == "" {
l := p.lit(pe.Dollar, "$")
if p.val == "" {
// e.g. "$\\\n" followed by a closing double
// quote, so we need the next token.
p.next()
} else {
// e.g. "$\\\"" within double quotes, so we must
// keep the rest of the literal characters.
l.ValueEnd = posAddCol(l.ValuePos, 1)
}
// e.g. "$\\\"" within double quotes, so we must
// keep the rest of the literal characters.
l.ValueEnd = posAddCol(l.ValuePos, 1)
return l
}
return pe
@@ -1008,6 +1002,8 @@ func (p *Parser) wordPart() WordPart {
p.rune()
p.next()
return sq
case escNewl:
p.litBs = append(p.litBs, '\\', '\n')
case utf8.RuneSelf:
p.tok = _EOF
p.quoteErr(sq.Pos(), sglQuote)
@@ -97,8 +97,10 @@ func (p *Printer) Print(w io.Writer, node Node) error {
p.line = x.Pos().Line()
p.command(x, nil)
case *Word:
p.line = x.Pos().Line()
p.word(x)
case WordPart:
p.line = x.Pos().Line()
p.wordPart(x, nil)
default:
return fmt.Errorf("unsupported node type: %T", x)
@@ -493,12 +495,16 @@ func (p *Printer) comments(comments ...Comment) {
}

func (p *Printer) wordParts(wps []WordPart) {
for i, n := range wps {
for i, wp := range wps {
var next WordPart
if i+1 < len(wps) {
next = wps[i+1]
}
p.wordPart(n, next)
if pos := wp.Pos(); pos.Line() > p.line {
p.bslashNewl()
}
p.wordPart(wp, next)
p.line = wp.End().Line()
}
}

@@ -125,11 +125,11 @@ var printTests = []printCase{
},
{
"a b\\\nc d",
"a bc \\\n\td",
"a bc d",
},
{
"a bb\\\ncc d",
"a bbcc \\\n\td",
"a bbcc d",
},
samePrint("a \\\n\tb \\\n\tc \\\n\t;"),
samePrint("a=1 \\\n\tb=2 \\\n\tc=3 \\\n\t;"),
@@ -392,6 +392,7 @@ var printTests = []printCase{
},
samePrint("\"foo\n$(bar)\""),
samePrint("\"foo\\\n$(bar)\""),
samePrint("\"foo\\\nbar\""),
samePrint("((foo++)) || bar"),
{
"a=b \\\nc=d \\\nfoo",

0 comments on commit 9162b76

Please sign in to comment.
You can’t perform that action at this time.