-
Notifications
You must be signed in to change notification settings - Fork 0
/
html.go
89 lines (79 loc) · 2.02 KB
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package worker
import (
"bytes"
"fmt"
"io"
"net/url"
"path/filepath"
"strings"
"golang.org/x/net/html"
)
// extractLinksFromHTML returns the set of unique links found in the anchor
// tags of body, each expressed as an absolute URL.
//
// pageURL is the address the document was fetched from; it is the base
// against which relative links are resolved. body is the raw HTML document.
//
// Links that already start with "http://" or "https://" are stored verbatim.
// Every other link is resolved against pageURL; links that cannot be resolved
// are skipped rather than failing the whole page.
//
// An error is returned when pageURL cannot be parsed or when the HTML cannot
// be tokenized; in both cases the returned set is nil.
func extractLinksFromHTML(pageURL string, body []byte) (linkSet map[string]struct{}, err error) {
	base, err := url.Parse(pageURL)
	if err != nil {
		return nil, fmt.Errorf("page url parse: %w", err)
	}

	rawLinks, err := extractRawLinksFromHTML(body)
	if err != nil {
		return nil, fmt.Errorf("extract raw links from html: %w", err)
	}

	// A map is used as a set so that each link appears only once.
	linkSet = make(map[string]struct{}, len(rawLinks))
	for _, raw := range rawLinks {
		// Require the full scheme separator: a bare "http" prefix would also
		// match relative paths such as "httpdocs/index.html", which must be
		// resolved against the page URL instead of being stored as-is.
		if strings.HasPrefix(raw, "http://") || strings.HasPrefix(raw, "https://") {
			linkSet[raw] = struct{}{}
			continue
		}

		resolved, parseErr := base.Parse(raw)
		if parseErr != nil {
			// Best effort: one malformed link should not discard the page.
			continue
		}
		linkSet[resolved.String()] = struct{}{}
	}
	return linkSet, nil
}
// pageLinkExtensions lists the file extensions (lower-case, leading dot
// included) that are accepted as links to HTML pages.
var pageLinkExtensions = []string{".asp", ".aspx", ".htm", ".html", ".jsp", ".jsx", ".php", ".php3", ".php4", ".php5", ".phtml"}

// extractRawLinksFromHTML tokenizes body and returns the href values of its
// <a> tags, in document order and without resolving or de-duplicating them.
//
// For each anchor, only the first non-blank href attribute is considered.
// It is kept when it is not a pure fragment ("#...") and isPageLink accepts
// it. Surrounding whitespace is trimmed from the returned values.
//
// A tokenizer error other than io.EOF is returned as-is with a nil slice.
func extractRawLinksFromHTML(body []byte) ([]string, error) {
	var links []string
	z := html.NewTokenizer(bytes.NewReader(body))
	for {
		switch z.Next() {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return links, nil
			}
			return nil, z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			// "<a href=... />" is emitted as a self-closing token, so both
			// token kinds have to be inspected.
			tagName, moreAttr := z.TagName()
			if len(tagName) != 1 || tagName[0] != 'a' {
				continue
			}
			for moreAttr {
				var key, val []byte
				key, val, moreAttr = z.TagAttr()
				if string(key) != "href" {
					continue
				}
				href := strings.TrimSpace(string(val))
				if href == "" {
					// Blank href: keep scanning the remaining attributes.
					continue
				}
				if !strings.HasPrefix(href, "#") && isPageLink(href) {
					links = append(links, href)
				}
				// Only the first usable href of a tag is considered.
				break
			}
		}
	}
}

// isPageLink reports whether href plausibly points at an HTML page: it must
// parse as a URL, must not be an opaque URI (mailto:, tel:, javascript:,
// data:, ...), and its path must have either no extension or one listed in
// pageLinkExtensions.
func isPageLink(href string) bool {
	u, err := url.Parse(href)
	if err != nil {
		// An href that does not parse cannot be resolved later either.
		return false
	}
	if u.Opaque != "" {
		return false
	}
	// Inspect only the path component, so that query strings, fragments and
	// host names ("https://example.com" -> ".com") are not mistaken for a
	// file extension.
	ext := strings.ToLower(filepath.Ext(u.Path))
	return ext == "" || contains(pageLinkExtensions, ext)
}
// contains reports whether str equals at least one element of s.
// A nil or empty slice never contains anything.
func contains(s []string, str string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == str {
			return true
		}
	}
	return false
}