/
extract.go
151 lines (127 loc) · 3.06 KB
/
extract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package besticon
import (
"bytes"
"errors"
"fmt"
"net/url"
"regexp"
"sort"
"strings"
"github.com/PuerkitoBio/goquery"
)
// iconPaths lists the hard-coded icon locations that findIconLinks always
// adds as candidates, in addition to any icons declared in the page itself.
var iconPaths = []string{
"/favicon.ico",
"/apple-touch-icon.png",
"/apple-touch-icon-precomposed.png",
}
// Tokens of a <link> tag's rel attribute that mark the tag as an icon.
// They are collected in iconTypes and compiled into iconTypesRe.
const (
favIcon = "icon"
appleTouchIcon = "apple-touch-icon"
appleTouchIconPrecomposed = "apple-touch-icon-precomposed"
)
// empty is a zero-size value type; findIconLinks uses map[string]empty as a
// set of unique URLs.
type empty struct{}
// findIconLinks returns the sorted, de-duplicated list of icon URLs for a page.
//
// Candidates come from two sources: the hard-coded iconPaths and the icon
// <link> tags found in html. Both are resolved against the page's base URL,
// which is siteURL unless the document declares its own <base href>.
// An error is returned only when html cannot be parsed.
func findIconLinks(siteURL *url.URL, html []byte) ([]string, error) {
	doc, err := docFromHTML(html)
	if err != nil {
		return nil, err
	}

	base := determineBaseURL(siteURL, doc)

	// Map keys act as a set, so duplicates collapse automatically.
	seen := make(map[string]empty)

	// Well-known icon locations.
	for _, p := range iconPaths {
		seen[urlFromBase(base, p)] = empty{}
	}

	// Icons declared in the page; hrefs that cannot be made absolute are
	// skipped on purpose (best effort).
	for _, href := range extractIconTags(doc) {
		resolved, resolveErr := absoluteURL(base, href)
		if resolveErr != nil {
			continue
		}
		seen[resolved] = empty{}
	}

	// Flatten the set and sort it, since map iteration order is random.
	var result []string
	for link := range seen {
		result = append(result, link)
	}
	sort.Strings(result)

	return result, nil
}
// determineBaseURL returns the URL that relative links in doc should be
// resolved against.
//
// If doc has no <base href> (or its href cannot be parsed), siteURL is
// returned unchanged. Otherwise the href is resolved against siteURL, so a
// relative base such as "/static/" or "assets/" still yields an absolute URL
// with siteURL's scheme and host. An absolute href replaces siteURL entirely.
func determineBaseURL(siteURL *url.URL, doc *goquery.Document) *url.URL {
	baseTagHref := extractBaseTag(doc)
	if baseTagHref == "" {
		return siteURL
	}

	baseTagURL, err := url.Parse(baseTagHref)
	if err != nil {
		// Unparseable <base href>: ignore it and fall back to the site URL.
		return siteURL
	}

	if siteURL == nil {
		// Nothing to resolve against; return the parsed href as-is.
		return baseTagURL
	}

	// <base href> may itself be relative; ResolveReference handles both the
	// relative and the absolute case per RFC 3986.
	return siteURL.ResolveReference(baseTagURL)
}
// docFromHTML parses raw HTML bytes into a goquery document.
// Any parse failure, or a nil document, is reported as errParseHTML.
func docFromHTML(html []byte) (*goquery.Document, error) {
	reader := bytes.NewReader(html)

	doc, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, errParseHTML
	}
	if doc == nil {
		return nil, errParseHTML
	}

	return doc, nil
}
// errParseHTML is returned by docFromHTML when the HTML cannot be turned into
// a goquery document.
var errParseHTML = errors.New("besticon: could not parse html")
// extractBaseTag returns the href of the first <base href="..."> element
// inside <head>, or "" when the document has none.
func extractBaseTag(doc *goquery.Document) string {
	// Attr yields "" for an empty selection, which covers the "no base tag" case.
	href, _ := doc.Find("head base[href]").First().Attr("href")
	return href
}
var (
// iconTypes lists the rel tokens that identify an icon <link>.
iconTypes = []string{favIcon, appleTouchIcon, appleTouchIconPrecomposed}
// iconTypesRe matches a string that is exactly one of iconTypes: an
// anchored alternation of the regexp-quoted tokens.
iconTypesRe = regexp.MustCompile(fmt.Sprintf("^(%s)$", strings.Join(regexpQuoteMetaArray(iconTypes), "|")))
)
// extractIconTags collects the href of every <link> element in doc that
// carries both href and rel attributes and whose rel marks it as an icon.
// Hrefs are returned in document order; the result is nil when none match.
func extractIconTags(doc *goquery.Document) []string {
	var found []string

	collect := func(_ int, link *goquery.Selection) {
		if href := extractIconTag(link); href != "" {
			found = append(found, href)
		}
	}
	doc.Find("link[href][rel]").Each(collect)

	return found
}
// extractIconTag returns the href of s when its rel attribute contains one of
// the known icon tokens (matched case-insensitively, token by token), and ""
// otherwise — including when rel or href is missing or empty.
func extractIconTag(s *goquery.Selection) string {
	rel, _ := s.Attr("rel")
	if rel == "" {
		return ""
	}

	// rel is a whitespace-separated token list, e.g. "shortcut icon".
	isIcon := false
	for _, token := range strings.Fields(strings.ToLower(rel)) {
		if iconTypesRe.MatchString(token) {
			isIcon = true
			break
		}
	}
	if !isIcon {
		return ""
	}

	// A missing href yields "", which is also the "no icon" result.
	href, _ := s.Attr("href")
	return href
}
// regexpQuoteMetaArray returns a new slice holding regexp.QuoteMeta applied to
// each element of a, in order. The input slice is not modified and the result
// is never nil.
func regexpQuoteMetaArray(a []string) []string {
	escaped := make([]string, 0, len(a))
	for _, literal := range a {
		escaped = append(escaped, regexp.QuoteMeta(literal))
	}
	return escaped
}