Skip to content
/ net Public
forked from golang/net

Commit

Permalink
html: Added ParseOptionIgnoreBOM option
Browse files Browse the repository at this point in the history
This option treats the UTF-8 BOM, if present, as whitespace to
prevent moving comments into the body element.

This is mainly intended for use with RenderOptionAllowXMLDeclarations
to prevent the XML declaration being moved into the body element
(which is invalid). See pgaskin/kepubify#36 for an example of this.
  • Loading branch information
pgaskin committed Jan 12, 2020
1 parent 85aecc8 commit fd41993
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 0 deletions.
19 changes: 19 additions & 0 deletions html/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ type parser struct {
// to be self closing (<whatever ... />). This is mainly for better
// compatibility with XHTML found in EPUBs. MOD(geek1011)
lenientSelfClosing bool
// ignoreBOM skips the BOM at the beginning of the document, if present.
// MOD(geek1011)
ignoreBOM bool
}

func (p *parser) top() *Node {
Expand Down Expand Up @@ -521,6 +524,11 @@ const whitespace = " \t\r\n\f"
func initialIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
// MOD(geek1011): Ignore BOM when considering initial data before document.
if p.ignoreBOM {
p.tok.Data = strings.TrimPrefix(p.tok.Data, "\xEF\xBB\xBF")
}
// END MOD
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
if len(p.tok.Data) == 0 {
// It was all whitespace, so ignore it.
Expand Down Expand Up @@ -2410,6 +2418,17 @@ func ParseOptionLenientSelfClosing(enable bool) ParseOption {
}
}

// ParseOptionIgnoreBOM returns a ParseOption that controls whether a leading
// UTF-8 byte order mark (EF BB BF) is ignored. When enabled, the parser strips
// the BOM from the initial text of the document before discarding leading
// whitespace, so it is effectively treated as whitespace. This option is
// mainly intended for use with RenderOptionAllowXMLDeclarations, to prevent
// the XML declaration from being moved into the body element. MOD(geek1011)
func ParseOptionIgnoreBOM(enable bool) ParseOption {
	// The returned option only records the flag on the parser.
	var opt ParseOption = func(ps *parser) {
		ps.ignoreBOM = enable
	}
	return opt
}

// ParseWithOptions is like Parse, with options.
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
p := &parser{
Expand Down
15 changes: 15 additions & 0 deletions html/parse_geek1011_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,18 @@ func TestMod_ParseLenientSelfClosing(t *testing.T) {
RenderedB: `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<span id="test"></span>&gt;<b>&lt; 1<span>test</span></b></i></p><p><i><b>Test 2</b></i></p></body></html>`,
}.Test(t)
}

// TestMod_ParseIgnoreBOM checks how a document that starts with a UTF-8 BOM
// followed by a comment is parsed and rendered, with ParseOptionIgnoreBOM
// disabled (case A) and enabled (case B).
func TestMod_ParseIgnoreBOM(t *testing.T) {
	const (
		// bom is the UTF-8 byte order mark.
		bom = "\xEF\xBB\xBF"
		// doc is the input without the BOM; it is also the expected output
		// when the BOM is ignored.
		doc = `<!-- Comment Text --><!DOCTYPE html><html><head><title>Title</title></head><body><p>Test 1</p></body></html>`
	)

	tc := testModCase{
		What:     `BOM and comment`,
		Original: bom + doc,

		// Case A: BOM not ignored. The expected output has the BOM and the
		// comment inside <body> and no doctype.
		ParseOptsA:  []ParseOption{ParseOptionIgnoreBOM(false)},
		RenderOptsA: nil,
		RenderedA:   `<html><head></head><body>` + bom + `<!-- Comment Text --><title>Title</title><p>Test 1</p></body></html>`,

		// Case B: BOM ignored. The expected output is the input minus the BOM.
		ParseOptsB:  []ParseOption{ParseOptionIgnoreBOM(true)},
		RenderOptsB: nil,
		RenderedB:   doc,
	}
	tc.Test(t)
}

0 comments on commit fd41993

Please sign in to comment.