Skip to content

Commit

Permalink
Atom: use correct xml:base for decoded elements (#222)
Browse files Browse the repository at this point in the history
* Atom: use correct xml:base for decoded elements

In order to keep tracking xml:base correctly, the goxpp's
`DecodeElement` pops the BaseStack if the start element added a base (if
any).

That means the atom parser needs keep track of the base *before* calling
`DecodeElement` to use for resolving relative URLs within the decoded
element.

Without this fix, elements with xml:base attributes will be erroneously
resolved with the parent xml:base.

* Depend on updated goxpp version without xml:base bug

* Resolve xml:base URLs without switching out the BaseStack

This provides an equivalent fix that doesn't do any inelegant swapping
out of the BaseStack. It also doesn't change `goxpp`'s public API by
essentially copying `XmlBaseResolveUrl` to `gofeed`.
  • Loading branch information
cristoper committed Mar 1, 2024
1 parent 8340fbd commit 9455e2b
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 11 deletions.
10 changes: 6 additions & 4 deletions atom/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,8 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
InnerXML string `xml:",innerxml"`
}

// get current base URL before it is clobbered by DecodeElement
base := p.BaseStack.Top()
err := p.DecodeElement(&text)
if err != nil {
return "", err
Expand All @@ -672,7 +674,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
if strings.Contains(result, "<![CDATA[") {
result = shared.StripCDATA(result)
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
result, _ = shared.ResolveHTML(p, result)
result, _ = shared.ResolveHTML(base, result)
}
} else {
// decode non-CDATA contents depending on type
Expand All @@ -683,12 +685,12 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
result, err = shared.DecodeEntities(result)
} else if strings.Contains(lowerType, "xhtml") {
result = ap.stripWrappingDiv(result)
result, _ = shared.ResolveHTML(p, result)
result, _ = shared.ResolveHTML(base, result)
} else if lowerType == "html" {
result = ap.stripWrappingDiv(result)
result, err = shared.DecodeEntities(result)
if err == nil {
result, _ = shared.ResolveHTML(p, result)
result, _ = shared.ResolveHTML(base, result)
}
} else {
decodedStr, err := base64.StdEncoding.DecodeString(result)
Expand All @@ -701,7 +703,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
// resolve relative URIs in URI-containing elements according to xml:base
name := strings.ToLower(p.Name)
if atomUriElements[name] {
resolved, err := p.XmlBaseResolveUrl(result)
resolved, err := shared.XmlBaseResolveUrl(base, result)
if resolved != nil && err == nil {
result = resolved.String()
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.19
require (
github.com/PuerkitoBio/goquery v1.8.0
github.com/json-iterator/go v1.1.12
github.com/mmcdole/goxpp v1.1.0
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23
github.com/stretchr/testify v1.8.1
github.com/urfave/cli v1.22.3
golang.org/x/net v0.4.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand Down
29 changes: 25 additions & 4 deletions internal/shared/xmlbase.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package shared
import (
"bytes"
"fmt"
"net/url"
"strings"

xpp "github.com/mmcdole/goxpp"
Expand Down Expand Up @@ -83,7 +84,7 @@ func resolveAttrs(p *xpp.XMLPullParser) error {
for i, attr := range p.Attrs {
lowerName := strings.ToLower(attr.Name.Local)
if uriAttrs[lowerName] {
absURL, err := p.XmlBaseResolveUrl(attr.Value)
absURL, err := XmlBaseResolveUrl(p.BaseStack.Top(), attr.Value)
if err != nil {
return err
}
Expand All @@ -95,11 +96,31 @@ func resolveAttrs(p *xpp.XMLPullParser) error {
return nil
}

// resolve u relative to b
func XmlBaseResolveUrl(b *url.URL, u string) (*url.URL, error) {
relURL, err := url.Parse(u)
if err != nil {
return nil, err
}

if b == nil {
return relURL, nil
}

if b.Path != "" && u != "" && b.Path[len(b.Path)-1] != '/' {
// There's no reason someone would use a path in xml:base if they
// didn't mean for it to be a directory
b.Path = b.Path + "/"
}
absURL := b.ResolveReference(relURL)
return absURL, nil
}

// Transforms html by resolving any relative URIs in attributes
// if an error occurs during parsing or serialization, then the original string
// is returned along with the error.
func ResolveHTML(p *xpp.XMLPullParser, relHTML string) (string, error) {
if p.BaseStack.Top() == nil {
func ResolveHTML(base *url.URL, relHTML string) (string, error) {
if base == nil {
return relHTML, nil
}

Expand All @@ -117,7 +138,7 @@ func ResolveHTML(p *xpp.XMLPullParser, relHTML string) (string, error) {
if n.Type == html.ElementNode {
for i, a := range n.Attr {
if htmlURIAttrs[a.Key] {
absVal, err := p.XmlBaseResolveUrl(a.Val)
absVal, err := XmlBaseResolveUrl(base, a.Val)
if absVal != nil && err == nil {
n.Attr[i].Val = absVal.String()
}
Expand Down

0 comments on commit 9455e2b

Please sign in to comment.