Skip to content

Commit

Permalink
Refactor RSS parser to use default namespace
Browse files Browse the repository at this point in the history
This change avoids some limitations of the Go XML parser regarding XML namespaces.
  • Loading branch information
fguillot committed Mar 12, 2024
1 parent d3a85b0 commit 9a637ce
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 181 deletions.
1 change: 1 addition & 0 deletions internal/reader/media/media.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
// textLinkRegex matches http:// and https:// URLs embedded in text. The (?mi)
// flags enable multi-line and case-insensitive matching. The final character
// class omits '?', '!', ':', ',', '.' and ';', so a match never ends on one of
// those punctuation characters.
var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)

// Element represents XML media elements.
// Specs: https://www.rssboard.org/media-rss
type Element struct {
MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"`
MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"`
Expand Down
43 changes: 43 additions & 0 deletions internal/reader/rss/atom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package rss // import "miniflux.app/v2/internal/reader/rss"

import "strings"

// AtomAuthor captures an <author> element in the Atom namespace
// (http://www.w3.org/2005/Atom) found inside the element being decoded.
type AtomAuthor struct {
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
}

// String returns the textual form of the wrapped Atom author, as produced by
// AtomPerson.String.
func (a *AtomAuthor) String() string {
	person := &a.Author
	return person.String()
}

// AtomPerson holds the <name> and <email> child elements of an Atom person
// element, as used by AtomAuthor.
type AtomPerson struct {
	Name  string `xml:"name"`
	Email string `xml:"email"`
}

// String returns the person's display value: the name when it is non-blank,
// otherwise the email address. The result has no leading or trailing
// whitespace and is empty when both fields are blank.
func (a *AtomPerson) String() string {
	// Trim before testing for emptiness so that a whitespace-only <name>
	// (for example an element containing just a newline) does not hide a
	// usable <email>.
	if name := strings.TrimSpace(a.Name); name != "" {
		return name
	}
	return strings.TrimSpace(a.Email)
}

// AtomLink holds the href, type, rel and length attributes of a link element.
// Length is kept as the raw attribute string; it is not parsed here.
type AtomLink struct {
URL string `xml:"href,attr"`
Type string `xml:"type,attr"`
Rel string `xml:"rel,attr"`
Length string `xml:"length,attr"`
}

// AtomLinks collects every <link> element in the Atom namespace
// (http://www.w3.org/2005/Atom) found inside the element being decoded.
type AtomLinks struct {
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
}
4 changes: 3 additions & 1 deletion internal/reader/rss/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ import (
// Parse returns a normalized feed struct from a RSS feed.
func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
feed := new(rssFeed)
if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
decoder := xml.NewXMLDecoder(data)
decoder.DefaultSpace = "rss"
if err := decoder.Decode(feed); err != nil {
return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
}
return feed.Transform(baseURL), nil
Expand Down
69 changes: 34 additions & 35 deletions internal/reader/rss/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) {
<item>
<title>Test</title>
<atom:link rel="payment" href="https://example.org/a" />
<atom:link rel="http://foobar.tld" href="https://example.org/b" />
<atom:link rel="alternate" href="https://example.org/b" />
</item>
</channel>
</rss>`
Expand Down Expand Up @@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
<title>Test</title>
<link>https://example.org/item</link>
<author>
by <![CDATA[Foo Bar]]>
<![CDATA[by Foo Bar]]>
</author>
</item>
</channel>
Expand All @@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
}
}

// TestParseEntryWithNonStandardAtomAuthor feeds the parser an item whose
// un-prefixed <author> element contains nested <name>, <title>, <department>
// and <company> children, and expects the entry author to be the <name> value.
func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) {
	const rawFeed = `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<title>Example</title>
<link>https://example.org/</link>
<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
<item>
<title>Test</title>
<link>https://example.org/item</link>
<author xmlns:author="http://www.w3.org/2005/Atom">
<name>Foo Bar</name>
<title>Vice President</title>
<department/>
<company>FooBar Inc.</company>
</author>
</item>
</channel>
</rss>`

	parsed, err := Parse("https://example.org/", bytes.NewReader([]byte(rawFeed)))
	if err != nil {
		t.Fatal(err)
	}

	if got, want := parsed.Entries[0].Author, "Foo Bar"; got != want {
		t.Errorf("Incorrect entry author, got %q instead of %q", got, want)
	}
}

func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
Expand Down Expand Up @@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
}
}

func TestParseEntryWithAtomAuthor(t *testing.T) {
func TestParseEntryWithAtomAuthorName(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
Expand Down Expand Up @@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
}
}

// TestParseEntryWithRSSDescriptionAndMediaDescription checks that when an item
// carries both an RSS <description> and a <media:description>, the entry
// content is taken from the RSS <description>.
func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
	data := `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Podcast Example</title>
<link>http://www.example.com/index.html</link>
<item>
<title>Entry Title</title>
<link>http://www.example.com/entries/1</link>
<description>Entry Description</description>
<media:description type="plain">Media Description</media:description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Abort on a count mismatch: Entries[0] is indexed below and would panic
	// on an empty slice, hiding the real failure behind a stack trace.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	expected := "Entry Description"
	result := feed.Entries[0].Content
	if expected != result {
		t.Errorf(`Unexpected description, got %q instead of %q`, result, expected)
	}
}

func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
Expand Down
42 changes: 28 additions & 14 deletions internal/reader/rss/podcast.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,24 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format")
// PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
// Specs:
// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
// - https://developers.google.com/search/reference/podcast/rss-feed
// - https://support.google.com/podcast-publishers/answer/9889544
type PodcastFeedElement struct {
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"`
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"`
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"`
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
}

// PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
type PodcastEntryElement struct {
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
}

// PodcastOwner represents contact information for the podcast owner.
Expand All @@ -38,6 +41,19 @@ type PodcastOwner struct {
Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
}

// String returns the owner's display value: the name when it is non-blank,
// otherwise the email address. The result has no leading or trailing
// whitespace and is empty when both fields are blank.
func (p *PodcastOwner) String() string {
	// Trim before testing for emptiness so that a whitespace-only name does
	// not hide a usable email address.
	if name := strings.TrimSpace(p.Name); name != "" {
		return name
	}
	return strings.TrimSpace(p.Email)
}

// Image represents podcast artwork.
type Image struct {
URL string `xml:"href,attr"`
Expand All @@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string {
author = e.ItunesAuthor
case e.GooglePlayAuthor != "":
author = e.GooglePlayAuthor
case e.PodcastOwner.Name != "":
author = e.PodcastOwner.Name
case e.PodcastOwner.Email != "":
author = e.PodcastOwner.Email
case e.PodcastOwner.String() != "":
author = e.PodcastOwner.String()
}

return strings.TrimSpace(author)
Expand Down

0 comments on commit 9a637ce

Please sign in to comment.