/
func_spider.go
101 lines (88 loc) · 2.49 KB
/
func_spider.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package crontab
import (
"errors"
// "github.com/astaxie/beego"
// "encoding/json"
// "github.com/astaxie/beego/httplib"
"github.com/astaxie/beego/logs"
// "github.com/astaxie/beego/orm"
"github.com/midoks/novelsearch/app/models"
// "regexp"
"strconv"
"strings"
"time"
)
// CronWebRuleSpider crawls one list page of a site-wide spider run for the
// app item v and schedules a detail crawl for every entry extracted from it.
//
//	url      - list-page URL template containing the {$RANGE} placeholder
//	ranges   - page range as "start,end", e.g. "1,100"
//	rule     - extraction rule passed to RegNovelList
//	path_tpl - detail-page URL template containing the {$ID} placeholder
//
// Progress is persisted in v.SpiderProgress; when a page yields no entries
// (or extraction fails) the cursor is reset to 0 so the next run restarts.
func CronWebRuleSpider(v *models.AppItem, url string, ranges string, rule string, path_tpl string) {
	timeStart := time.Now().Unix()

	// Parse the "start,end" range. Guard against a malformed value instead
	// of panicking with an index-out-of-range on a missing comma.
	parts := strings.Split(ranges, ",")
	if len(parts) < 2 {
		logs.Error("全站采集范围错误:%v, ranges:%s", errors.New("ranges must be \"start,end\""), ranges)
		return
	}
	start, err := strconv.Atoi(parts[0])
	if err != nil {
		return
	}
	end, err := strconv.Atoi(parts[1])
	if err != nil {
		return
	}

	curPage := ""
	if start != end {
		// Multi-page mode: advance the persisted progress cursor and
		// substitute it into the URL template.
		v.SpiderProgress = v.SpiderProgress + 1
		logs.Info("网站(%s)采集:进度:%d, 结束在:%d", v.Name, v.SpiderProgress, end)
		curPage = strings.Replace(url, "{$RANGE}", strconv.Itoa(v.SpiderProgress), -1)
	} else {
		// Single-page mode: previously curPage was left empty and an empty
		// URL was fetched; substitute the fixed start page instead.
		curPage = strings.Replace(url, "{$RANGE}", strconv.Itoa(start), -1)
	}
	logs.Warn("全站采集开始:url:%s", curPage)

	if content, errcur := getHttpData2Code(curPage, v.PageCharset); errcur == nil {
		// Renamed from "list" to avoid shadowing the range slice above.
		items, errlist := RegNovelList(content, rule)
		if errlist != nil {
			// Extraction failed: log with a well-formed format string
			// (the original passed 3 args to a single %s verb) and reset
			// the cursor so the next run starts over.
			logs.Error("全站采集错误:%s, err:%v, rule:%s", curPage, errlist, rule)
			v.SpiderProgress = 0
			v.Update("SpiderProgress")
			logs.Error("全站采集结束(重置)url:%s", curPage)
		} else if len(items) == 0 {
			// An empty page marks the end of the site: reset and stop.
			v.SpiderProgress = 0
			v.Update("SpiderProgress")
			logs.Error("全站采集结束(重置)url:%s", curPage)
			return
		} else {
			for _, item := range items {
				detail := strings.Replace(path_tpl, "{$ID}", item["url"].(string), -1)
				// Normalize scheme-less URLs before crawling.
				if !strings.HasPrefix(detail, "https://") && !strings.HasPrefix(detail, "http://") {
					detail = "http://" + detail
				}
				CronPathInfo(v, detail, item["name"].(string))
			}
		}
	}

	timeEnd := time.Now().Unix()
	v.Update("SpiderProgress")
	logs.Warn("全站采集结束:url:%s耗时:%d", curPage, timeEnd-timeStart)
}
// WebRuleSpider starts a site-wide crawl for every enabled app item.
// Items missing any of the four spider settings (expression, range, rule,
// path template) are skipped. Each eligible item is crawled in its own
// goroutine; this function does not wait for them. Always returns nil.
func WebRuleSpider() error {
	// Only items whose status is "1" (enabled) are considered.
	filters := []interface{}{"status", "1"}
	// NOTE(review): the second return value of ItemGetList is discarded,
	// matching the original behavior — confirm whether it carries an error.
	items, _ := models.ItemGetList(1, 10000, filters...)
	if len(items) == 0 {
		logs.Info("全站更新(无更新数据):end!")
		return nil
	}
	for _, item := range items {
		// Guard clause: skip items that lack a complete spider config.
		if item.SpiderExp == "" || item.SpiderRange == "" || item.SpiderRule == "" || item.PathTpl == "" {
			logs.Info("全站更新(条件不足):end!")
			continue
		}
		// Arguments are evaluated at the `go` statement, so each goroutine
		// receives its own item even on pre-1.22 Go.
		go CronWebRuleSpider(item, item.SpiderExp, item.SpiderRange, item.SpiderRule, item.PathTpl)
	}
	return nil
}