/
feed.go
117 lines (96 loc) · 2.57 KB
/
feed.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
package crawler
import (
"fmt"
"strings"
"github.com/mmcdole/gofeed"
"golang.org/x/net/html"
"errors"
)
// GetFeedLink determines where the feed for the crawler's current source
// lives. It returns (feedType, feedLocation, error).
//
// The source is prepared via c.FromAuto, and c.Detect is invoked when no
// content type has been recorded yet. The result then depends on the content
// type:
//   - containing "text/xml": the source is taken to be the feed itself, so an
//     empty feed type and c.sourceLocation are returned;
//   - containing "text/html": the lookup is delegated to GetFeedLinkFromHTML;
//   - anything else: an error is returned.
func (c *Crawler) GetFeedLink() (string, string, error) {
	if err := c.FromAuto(); err != nil {
		return "", "", err
	}
	if c.source == nil {
		return "", "", errors.New("No source available!")
	}

	// Only run detection when the content type is still unknown.
	if c.contentType == "" {
		err := c.Detect()
		if err != nil {
			return "", "", err
		}
		// Detect may succeed without having set a content type.
		if c.contentType == "" {
			return "", "", errors.New("Could not detect content type!")
		}
	}

	switch kind := c.contentType; {
	case strings.Contains(kind, "text/xml"):
		return "", c.sourceLocation, nil
	case strings.Contains(kind, "text/html"):
		return c.GetFeedLinkFromHTML()
	}

	return "", "", errors.New("No feed link found")
}
// GetFeedLinkFromHTML parses c.source as HTML and returns the type and href
// of the first <link> element (depth-first, document order) that has both a
// "type" attribute containing "rss" or "atom" and an "href" attribute.
//
// An href starting with "./" or "/" is appended to c.sourceLocation; any
// other href is returned unchanged. The return values are
// (feedType, feedHref, error). An error is returned when the HTML cannot be
// parsed or when no matching <link> element exists.
func (c *Crawler) GetFeedLinkFromHTML() (string, string, error) {
	doc, err := html.Parse(c.source)
	if err != nil {
		return "", "", err
	}

	// findLink walks the tree depth-first and stops at the first feed link.
	var findLink func(n *html.Node) (feedType, feedHref string, found bool)
	findLink = func(n *html.Node) (string, string, bool) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var feedType, feedHref string
			var hasType, hasHref bool
			for _, attr := range n.Attr {
				switch attr.Key {
				case "type":
					if strings.Contains(attr.Val, "rss") || strings.Contains(attr.Val, "atom") {
						feedType, hasType = attr.Val, true
					}
				case "href":
					// Presence is tracked separately so an empty href still counts.
					feedHref, hasHref = attr.Val, true
				}
			}
			if hasType && hasHref {
				return feedType, feedHref, true
			}
			// A <link> element is never descended into, whether it matched or not.
			return "", "", false
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			if feedType, feedHref, found := findLink(child); found {
				return feedType, feedHref, true
			}
		}
		return "", "", false
	}

	feedType, feedHref, found := findLink(doc)
	if !found {
		return "", "", errors.New("No feed URL found!")
	}

	base := strings.TrimRight(c.sourceLocation, "/")
	switch {
	case strings.HasPrefix(feedHref, "./"):
		// Strip only the literal "./" prefix. strings.TrimLeft takes a
		// character set, so TrimLeft(href, "./") would also eat dots that
		// belong to the path (e.g. "./.well-known/x" or "./../x").
		rel := strings.TrimLeft(strings.TrimPrefix(feedHref, "./"), "/")
		feedHref = fmt.Sprintf("%s/%s", base, rel)
	case strings.HasPrefix(feedHref, "/"):
		// NOTE(review): a root-relative href is appended to the full
		// sourceLocation rather than resolved against the host root —
		// confirm this is intended when sourceLocation contains a path.
		feedHref = fmt.Sprintf("%s/%s", base, strings.TrimLeft(feedHref, "/"))
	}

	return feedType, feedHref, nil
}
// ParseFeed prepares the crawler's source via c.FromAuto and parses it with
// a fresh gofeed parser. It returns the parsed feed, or a nil feed together
// with the error when preparation fails, no source is available, or parsing
// fails.
func (c *Crawler) ParseFeed() (*gofeed.Feed, error) {
	err := c.FromAuto()
	if err != nil {
		return nil, err
	}
	if c.source == nil {
		return nil, errors.New("No source available!")
	}

	parsed, err := gofeed.NewParser().Parse(c.source)
	if err != nil {
		// Never hand back a feed alongside an error.
		return nil, err
	}
	return parsed, nil
}