client.go
package client

import (
	"net/url"

	"github.com/go-errors/errors"
	"github.com/m1/smap/crawler"
)

const (
	defaultUserAgent  = "smap-v0.0.1"
	defaultMaxWorkers = 50
)

// Client is the main entry point for this package. It lets you
// define a crawling client and then crawl multiple URLs with the
// same client/config.
type Client struct {
	Config *Config
}

// Config is the configuration for the client.
type Config struct {
	// MaxWorkers is the maximum number of workers in the crawling
	// pool; ideally this will be around 50-100.
	MaxWorkers int

	// IgnoreRobotsTxt turns off polite crawling; setting this to
	// true is not recommended.
	IgnoreRobotsTxt bool

	// UserAgent is the user agent the crawler will use.
	// Defaults to `smap-v0.0.1`.
	UserAgent string
}

// New returns a new client, populating the config with default
// values where they are not set.
func New(config *Config) (*Client, error) {
	if config.MaxWorkers == 0 {
		config.MaxWorkers = defaultMaxWorkers
	}
	if config.MaxWorkers < 0 {
		return nil, errors.New("MaxWorkers must not be negative")
	}
	if config.UserAgent == "" {
		config.UserAgent = defaultUserAgent
	}

	return &Client{
		Config: config,
	}, nil
}

// Crawl starts crawling the given URL and returns the resulting
// sitemap. The URL should be the base URL of the site (an empty
// path or "/").
func (c *Client) Crawl(url *url.URL) (crawler.SiteMap, error) {
	if url.Path != "/" && url.Path != "" {
		return nil, errors.New("url should be the base url")
	}

	cr := crawler.New(*url, c.Config.IgnoreRobotsTxt, c.Config.MaxWorkers, c.Config.UserAgent)
	err := cr.Run()
	if err != nil {
		return nil, err
	}

	return cr.SiteMap, nil
}
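
For context, a minimal usage sketch of this client (assuming the package is importable as github.com/m1/smap/client; the example URL, worker count, and the way the sitemap is printed are illustrative only, not part of this file):

package main

import (
	"fmt"
	"log"
	"net/url"

	"github.com/m1/smap/client"
)

func main() {
	// Build a client; UserAgent falls back to the package default
	// ("smap-v0.0.1") because it is left empty here.
	c, err := client.New(&client.Config{
		MaxWorkers: 50, // illustrative value
	})
	if err != nil {
		log.Fatal(err)
	}

	// Crawl expects the base URL of the site (path "" or "/").
	u, err := url.Parse("https://example.com/") // hypothetical target
	if err != nil {
		log.Fatal(err)
	}

	siteMap, err := c.Crawl(u)
	if err != nil {
		log.Fatal(err)
	}

	// Print the crawler.SiteMap using its default formatting.
	fmt.Printf("%v\n", siteMap)
}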