This repository has been archived by the owner on Apr 9, 2022. It is now read-only.
/
htmlparse.go
160 lines (129 loc) · 4.1 KB
/
htmlparse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Author: Liam Stanley <me@liamstanley.io>
// Docs: https://marill.liam.sh/
// Repo: https://github.com/Liamraystanley/marill
package scraper
import (
"fmt"
"io"
"net/url"
"regexp"
"strings"
"golang.org/x/net/html"
)
// getAttr looks up attr by key in attrs and returns the value of the first
// attribute whose key matches exactly. An empty string is returned when no
// attribute with that key is present.
func getAttr(attr string, attrs []html.Attribute) (val string) {
	for i := range attrs {
		if attrs[i].Key != attr {
			continue
		}

		// first match wins; later attributes with the same key are ignored
		return attrs[i].Val
	}

	return ""
}
// getSrc tokenizes the HTML document read from b and collects the resource
// references found on link (href), script (src) and img (src) tags. Each
// reference is passed through fmtTagLinks together with parent, references
// that come back empty are dropped, and duplicates are removed before the
// list is returned.
func getSrc(b io.Reader, parent *url.URL) (urls []string) {
	urls = []string{}
	tokenizer := html.NewTokenizer(b)

	for {
		// walk every token of the html body response
		switch tokenizer.Next() {
		case html.ErrorToken:
			// treated as the end of the document -- no further tokens are read
			stripURLDups(&urls)
			return urls
		case html.StartTagToken, html.SelfClosingTagToken:
			// only opening/self-closing tags can carry resource attributes
		default:
			continue
		}

		tok := tokenizer.Token()

		// pick the attribute that holds the resource location for this tag
		var attrName string
		switch tok.Data {
		case "link":
			attrName = "href"
		case "script", "img":
			attrName = "src"
		default:
			continue
		}

		if tok.Data == "link" {
			// link tags with a rel other than stylesheet/shortcut icon are
			// skipped; a missing or empty rel is let through
			rel := strings.ToLower(getAttr("rel", tok.Attr))
			if rel != "" && rel != "stylesheet" && rel != "shortcut icon" {
				continue
			}
		}

		if resolved := fmtTagLinks(getAttr(attrName, tok.Attr), parent); len(resolved) > 0 {
			urls = append(urls, resolved)
		}
	}
}
// nonPrefixMatch matches sources that begin with an ASCII letter, i.e. a path
// written relative to the current page without a leading "./" or "/".
var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)

// schemePrefixMatch matches sources that begin with an explicit URI scheme
// followed by a colon, e.g. "http:", "ftp:" or "data:".
var schemePrefixMatch = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9+.-]*:`)

// fmtTagLinks formats the resulting link from its previous form -- may it
// be a relative link, absolute link, invalid, etc.
//
// src is the raw attribute value taken from the tag and parent is the URL of
// the page the tag was found on. parent is only read, never modified. The
// return value is:
//   - "" when src is empty, ends in "/", carries a scheme other than
//     http/https (ftp:, data:, ...), or when parent itself is not http/https;
//   - src unchanged when it already is an http:// or https:// url;
//   - an absolute url built from parent for "//host/x", "/x", "./x" and "x"
//     style sources;
//   - src unchanged for any other form (e.g. "../x.js"), which is not resolved.
func fmtTagLinks(src string, parent *url.URL) string {
	// this assumes that the resource is something along the lines of:
	//   http://something.com/ -- which we don't care about
	if len(src) == 0 || strings.HasSuffix(src, "/") {
		return ""
	}

	// every resolved resource inherits the parent's scheme, so nothing
	// fetchable can be produced when the parent isn't http based.
	if parent.Scheme != "http" && parent.Scheme != "https" {
		return ""
	}

	// resource carries its own scheme. keep http(s) urls exactly as written and
	// ignore anything else that isn't http based. E.g. ftp://, and other
	// svg-like data urls, as we really can't fetch those. this has to happen
	// before the relative checks below, otherwise "data:..." would be mistaken
	// for a relative path and glued onto the parent url.
	if schemePrefixMatch.MatchString(src) {
		lower := strings.ToLower(src)
		if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
			return src
		}
		return ""
	}

	// parent path without trailing slashes (an empty path and "/" both end up
	// as ""). kept in a local so the caller's url is left untouched.
	base := strings.TrimRight(parent.Path, "/")

	switch {
	case strings.HasPrefix(src, "//"):
		// site is loading resources from a remote location that supports both
		// http and https. browsers should natively tack on the current sites
		// protocol to the url. E.g:
		//   - url: http://domain.com/ and resource: //other.com/some-resource.js
		//     generates: http://other.com/some-resource.js
		//   - url: https://domain.com/ and resource: //other.com/some-resource.js
		//     generates: https://other.com/some-resource.js
		return parent.Scheme + ":" + src
	case strings.HasPrefix(src, "/"):
		// non-host-absolute resource. E.g. resource is loaded based on the
		// docroot of the domain. E.g:
		//   - url: http://domain.com/ and resource: /some-resource.js
		//     generates: http://domain.com/some-resource.js
		//   - url: https://domain.com/sub/resource and resource: /some-resource.js
		//     generates: https://domain.com/some-resource.js
		return parent.Scheme + "://" + parent.Host + src
	case strings.HasPrefix(src, "./"):
		// site was developed using relative paths. E.g:
		//   - url: http://domain.com/sub/path and resource: ./something/main.js
		//     would equal http://domain.com/sub/path/something/main.js
		return fmt.Sprintf("%s://%s%s/%s", parent.Scheme, parent.Host, base, strings.TrimPrefix(src, "./"))
	case nonPrefixMatch.MatchString(src):
		// site was developed using sub-relative paths. E.g:
		//   - url: http://domain.com/sub/path and resource: something/main.js
		//     would equal http://domain.com/sub/path/something/main.js
		return fmt.Sprintf("%s://%s%s/%s", parent.Scheme, parent.Host, base, src)
	}

	// any other form (e.g. "../x.js") is not resolved and is handed back as-is.
	return src
}
// stripURLDups strips all duplicate src URLs from the slice that domains
// points to, replacing it with a new slice that keeps only the first
// occurrence of each entry in its original order. When nothing remains (the
// input was empty) the slice is set to nil. A nil domains pointer is a no-op.
func stripURLDups(domains *[]string) {
	if domains == nil {
		return
	}

	// set of entries already kept, so each lookup is constant time instead of
	// rescanning the output slice for every input element
	seen := make(map[string]struct{}, len(*domains))

	var unique []string
	for _, dom := range *domains {
		if _, dup := seen[dom]; dup {
			continue
		}
		seen[dom] = struct{}{}
		unique = append(unique, dom)
	}

	*domains = unique
}