-
Notifications
You must be signed in to change notification settings - Fork 0
/
parallel.go
92 lines (78 loc) · 2.21 KB
/
parallel.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package main
import (
	"fmt"
	"log"
	"net"
	"os"
	"os/signal"
	"sync/atomic"
	"syscall"

	"github.com/misterpilou/playweb/colly"
	"google.golang.org/grpc"
)
// parserServer is the value registered with the gRPC server in main via
// colly.RegisterParserServer.
type parserServer struct {
	// Embedded so that parserServer picks up whatever methods
	// colly.UnimplementedParserServer declares.
	// NOTE(review): presumably the generated gRPC stub with default
	// "unimplemented" handlers; confirm against the colly package.
	colly.UnimplementedParserServer

	// savedParsed holds parsed results. Nothing in the visible code reads or
	// writes it yet.
	savedParsed []*colly.Parsed
}
// const (
// port = "127.0.0.1:50051"
// )
// main crawls https://en.wikipedia.org/ with an asynchronous collector,
// counting the responses it receives, and once the crawl has drained starts a
// gRPC server exposing parserServer on TCP port 50017.
//
// Receiving SIGINT or SIGTERM at any point prints the response counters and
// exits with status 1.
func main() {
	// signal.Notify does not block when delivering, so the channel must be
	// buffered or a signal arriving before the receiver is ready is dropped.
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt, syscall.SIGTERM)

	// Response counters. They are updated from collector callbacks, which may
	// run on several goroutines at once because the collector is async, and
	// are read from the signal goroutine, so every access goes through
	// sync/atomic.
	var okCount, respCount int64

	// Report the counters and stop as soon as a termination signal arrives.
	// This goroutine lives for the rest of the process.
	go func() {
		<-sig
		fmt.Println("number of 200 ", atomic.LoadInt64(&okCount))
		fmt.Println("number of request ", atomic.LoadInt64(&respCount))
		os.Exit(1)
	}()

	// Instantiate the collector.
	c := colly.NewCollector(
		// MaxDepth is 2, so only the links on the scraped page
		// and links on those pages are visited.
		colly.MaxDepth(2),
		colly.Async(true),
	)

	// Limit the maximum parallelism to 8 for every domain.
	// This is necessary if the goroutines are dynamically
	// created to control the limit of simultaneous requests.
	//
	// Parallelism can also be controlled by spawning a fixed
	// number of goroutines.
	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8})

	c.OnResponse(func(res *colly.Response) {
		if res.StatusCode == 200 {
			atomic.AddInt64(&okCount, 1)
		}
		atomic.AddInt64(&respCount, 1)
	})

	c.OnRequest(func(req *colly.Request) {
		fmt.Println(req.URL)
	})

	// On every a element which has an href attribute, follow the link.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Visit through the originating request rather than the collector so
		// that relative hrefs are resolved against the current page and the
		// request depth is carried forward, which is what lets MaxDepth take
		// effect. The result is deliberately ignored: rejected links
		// (already visited, too deep) are expected during a crawl.
		// NOTE(review): relies on Request.Visit behaving as in upstream
		// colly; confirm for this fork.
		e.Request.Visit(link)
	})

	// Start scraping on https://en.wikipedia.org
	if err := c.Visit("https://en.wikipedia.org/"); err != nil {
		log.Printf("visiting seed URL: %v", err)
	}

	// Wait until all in-flight requests are finished.
	c.Wait()

	lis, err := net.Listen("tcp4", ":50017")
	if err != nil {
		log.Fatalf("failed to listen: %v", err)
	}

	grpcServer := grpc.NewServer()
	colly.RegisterParserServer(grpcServer, &parserServer{})

	// TODO: determine whether to use TLS.
	// Serve blocks until the server stops; check its own error rather than
	// the stale one from net.Listen.
	if err := grpcServer.Serve(lis); err != nil {
		log.Fatalf("Error: %v", err)
	}
}