-
Notifications
You must be signed in to change notification settings - Fork 0
/
sitemapparser.go
111 lines (93 loc) · 2.18 KB
/
sitemapparser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package sitemapparser
import (
	"bytes"
	"compress/gzip"
	"errors"
	"fmt"
	"log"
	"net/http"

	"github.com/beevik/etree"
)
// Scheduler downloads the sitemap at url and returns the URLs it lists.
//
// When the document is a sitemap index, every sitemap it references is
// fetched recursively and the URLs they contain are returned as one flat
// slice. Download and parse failures are returned to the caller rather than
// terminating the process.
func Scheduler(url string) ([]string, error) {
	sitemapXML, err := Downloader(url)
	if err != nil {
		return nil, fmt.Errorf("sitemap %q: %w", url, err)
	}

	parsedURLs, isSitemap, err := Parser(sitemapXML)
	if err != nil {
		return nil, fmt.Errorf("sitemap %q: %w", url, err)
	}
	if !isSitemap {
		// A plain urlset: its entries are the final result.
		return parsedURLs, nil
	}

	var parsedSiteURLs []string
	for _, indexURL := range parsedURLs {
		nested, err := Scheduler(indexURL)
		if err != nil {
			// Already annotated with the failing URL by the recursive call.
			return nil, err
		}
		parsedSiteURLs = append(parsedSiteURLs, nested...)
	}

	// Kept from the original behaviour: if none of the referenced sitemaps
	// yielded a URL, fall back to the index's own entries.
	if len(parsedSiteURLs) > 0 {
		return parsedSiteURLs, nil
	}
	return parsedURLs, nil
}
// Downloader fetches url with an HTTP GET and returns the response body as a
// string.
//
// A body that starts with the gzip magic number (0x1f 0x8b) is decompressed
// before being returned; any other body is returned unchanged, so both
// ".xml.gz" and plain ".xml" sitemaps are accepted. Responses with a non-2xx
// status, transport failures and read/decompression failures are returned as
// errors.
func Downloader(url string) (string, error) {
	log.Printf("start download: %s", url)

	// NOTE(review): this client has no timeout, so a stalled server blocks
	// the caller indefinitely; consider configuring one.
	client := new(http.Client)
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", fmt.Errorf("building request for %q: %w", url, err)
	}

	response, err := client.Do(request)
	if err != nil {
		return "", fmt.Errorf("downloading %q: %w", url, err)
	}
	defer response.Body.Close()

	if response.StatusCode < 200 || response.StatusCode > 299 {
		return "", fmt.Errorf("downloading %q: unexpected status %s", url, response.Status)
	}

	var raw bytes.Buffer
	if _, err := raw.ReadFrom(response.Body); err != nil {
		return "", fmt.Errorf("reading body of %q: %w", url, err)
	}

	// Only gunzip when the payload really is gzip; plain XML passes through.
	body := raw.Bytes()
	if len(body) < 2 || body[0] != 0x1f || body[1] != 0x8b {
		return raw.String(), nil
	}

	reader, err := gzip.NewReader(&raw)
	if err != nil {
		return "", fmt.Errorf("opening gzip stream from %q: %w", url, err)
	}
	defer reader.Close()

	var out bytes.Buffer
	if _, err := out.ReadFrom(reader); err != nil {
		return "", fmt.Errorf("decompressing %q: %w", url, err)
	}
	return out.String(), nil
}
// Parser extracts the <loc> values from a sitemap document.
//
// sitemapXML is the XML text of either a <urlset> or a <sitemapindex>
// document. The returned slice holds the text of each entry's <loc> child in
// document order; the boolean is true when the root element is
// <sitemapindex>, meaning the returned strings come from <sitemap> entries
// rather than <url> entries. Entries without a <loc> child are skipped.
//
// An error is returned when the text cannot be parsed as XML or when neither
// root element is present.
func Parser(sitemapXML string) ([]string, bool, error) {
	doc := etree.NewDocument()
	if err := doc.ReadFromString(sitemapXML); err != nil {
		return nil, false, fmt.Errorf("parsing sitemap XML: %w", err)
	}

	var (
		entries        []*etree.Element
		isSitemapIndex bool
	)
	if urlSet := doc.SelectElement("urlset"); urlSet != nil {
		entries = urlSet.SelectElements("url")
	} else if sitemapSet := doc.SelectElement("sitemapindex"); sitemapSet != nil {
		isSitemapIndex = true
		entries = sitemapSet.SelectElements("sitemap")
	} else {
		return []string{}, false, errors.New("something wrong string: " + sitemapXML)
	}

	siteUrls := make([]string, 0, len(entries))
	for _, entry := range entries {
		loc := entry.SelectElement("loc")
		if loc == nil {
			// Malformed entry: calling Text on a nil element would panic.
			continue
		}
		siteUrls = append(siteUrls, loc.Text())
	}
	return siteUrls, isSitemapIndex, nil
}