/
action.go
139 lines (105 loc) · 2.51 KB
/
action.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package index
import (
	"bufio"
	"errors"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html/charset"
	"gopkg.in/xmlpath.v2"

	"github.com/muniere/glean/internal/app/server/batch/log"
	"github.com/muniere/glean/internal/pkg/urls"
)
// command carries the input of a single index action:
// the URL of the page to fetch and scrape.
type command struct {
	// uri is the address of the page to index.
	uri *url.URL
}
// Perform fetches the HTML document at uri and scrapes it into a SiteInfo
// describing the page title and the image links it contains.
// It returns an error if the page cannot be retrieved or parsed.
func Perform(uri *url.URL, options Options) (*SiteInfo, error) {
	ctx := compose(command{uri: uri}, options)

	doc, err := fetch(ctx, options)
	if err != nil {
		return nil, err
	}

	return scrape(doc, ctx, options)
}
// compose builds the scraping context for the given command.
// options is accepted for signature symmetry with fetch/scrape but is
// not consulted here.
func compose(cmd command, options Options) context {
	ctx := context{uri: cmd.uri}
	return ctx
}
// fetch downloads the page addressed by ctx.uri and parses it into an HTML
// node tree, decoding the body according to the detected character set.
// It returns an error for transport failures, non-200 responses, or
// unparsable documents.
func fetch(ctx context, options Options) (*xmlpath.Node, error) {
	log.Start(ctx.dict())
	defer log.Finish(ctx.dict())

	res, err := http.Get(ctx.uri.String())
	if err != nil {
		return nil, err
	}
	defer func() {
		_ = res.Body.Close()
	}()

	if res.StatusCode != http.StatusOK {
		return nil, errors.New(res.Status)
	}

	r := bufio.NewReader(res.Body)

	// Peek reports io.EOF alongside the available bytes when the body is
	// shorter than 1024 bytes; that is still a valid (small) document, so
	// only treat other errors as fatal.
	data, err := r.Peek(1024)
	if err != nil && err != io.EOF {
		return nil, err
	}

	// DetermineEncoding always returns a usable encoding (falling back to a
	// default guess when uncertain), so decode through it unconditionally.
	// Parsing must continue from the buffered reader r — the first bytes of
	// res.Body have already been consumed into r's buffer by Peek.
	enc, _, _ := charset.DetermineEncoding(data, res.Header.Get("Content-Type"))
	return xmlpath.ParseHTML(enc.NewDecoder().Reader(r))
}
// imagePattern matches values containing a common raster-image extension
// (jpg, png, gif). Compiled once at package init instead of per call.
var imagePattern = regexp.MustCompile(`.*\.(jpg|png|gif)`)

// scrape extracts the page title and image links (from both <a href> and
// <img src> attributes) out of doc, deduplicates the links, and assembles
// the resulting SiteInfo. An optional options.Grep pattern further filters
// the collected URLs.
func scrape(doc *xmlpath.Node, ctx context, options Options) (*SiteInfo, error) {
	log.Start(ctx.dict())
	defer log.Finish(ctx.dict())

	title := scrapeTitle(doc)

	hrefs, err := scrapeURLs(doc, "//a/@href", imagePattern, options.Grep)
	if err != nil {
		return nil, err
	}

	srcs, err := scrapeURLs(doc, "//img/@src", imagePattern, options.Grep)
	if err != nil {
		return nil, err
	}

	links := urls.Unique(append(hrefs, srcs...))
	log.Result(len(links), ctx.dict())

	info := SiteInfo{
		URI:   ctx.uri,
		Title: title,
		Links: links,
	}
	return &info, nil
}
// scrapeTitle returns the text of the document's first <title> element,
// or the empty string when the document has none.
func scrapeTitle(doc *xmlpath.Node) string {
	iter := xmlpath.MustCompile("//title").Iter(doc)
	if !iter.Next() {
		return ""
	}
	return iter.Node().String()
}
// scrapeURLs evaluates the XPath expression path against doc and returns the
// matching values as parsed URLs. A value is kept only if it satisfies both
// pattern and grep (a nil filter always passes). Spaces are normalized to
// '+' before parsing; values that still fail to parse are silently skipped.
func scrapeURLs(doc *xmlpath.Node, path string, pattern *regexp.Regexp, grep *regexp.Regexp) ([]*url.URL, error) {
	var result []*url.URL

	iter := xmlpath.MustCompile(path).Iter(doc)
	for iter.Next() {
		raw := iter.Node().String()

		if pattern != nil && !pattern.MatchString(raw) {
			continue
		}
		if grep != nil && !grep.MatchString(raw) {
			continue
		}

		normalized := strings.Replace(raw, " ", "+", -1)
		if u, err := url.Parse(normalized); err == nil {
			result = append(result, u)
		}
	}

	return result, nil
}