-
Notifications
You must be signed in to change notification settings - Fork 118
/
_5_ContinuousCrawler.kt
30 lines (24 loc) · 1 KB
/
_5_ContinuousCrawler.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package ai.platon.pulsar.examples
import ai.platon.pulsar.browser.common.BrowserSettings
import ai.platon.pulsar.common.LinkExtractors
import ai.platon.pulsar.skeleton.context.PulsarContexts
import ai.platon.pulsar.skeleton.crawl.common.url.ParsableHyperlink
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.WebPage
/**
 * Continuous crawl example.
 *
 * Seed URLs are read from a classpath resource, wrapped in [ParsableHyperlink]s that share
 * one parse callback, and submitted to a [PulsarContexts]-created context. The callback
 * prints each document's title and base URI, then submits the links it selects from that
 * document back to the same context, so the crawl keeps feeding itself.
 */
fun main() {
    // NOTE(review): judging by its name, this turns on automatic export of the original
    // page content — confirm against BrowserSettings.
    BrowserSettings.enableOriginalPageContentAutoExporting()

    val context = PulsarContexts.create()

    // Shared callback for every seed link. The last expression stays the submitAll call,
    // so the lambda's inferred return type is whatever submitAll returns.
    val handleDocument = { _: WebPage, doc: FeaturedDocument ->
        // Report the document: "<title>\t|\t<baseURI>".
        println("${doc.title}\t|\t${doc.baseURI}")

        // Queue the anchors whose href matches /dp/ for crawling as well.
        val discoveredLinks = doc.selectHyperlinks("a[href~=/dp/]")
        context.submitAll(discoveredLinks)
    }

    // Swap in seeds100.txt for a larger crawl.
    val seedUrls = LinkExtractors.fromResource("seeds10.txt")

    // Each seed gets the " -refresh" load argument appended.
    // NOTE(review): presumably this forces a re-fetch instead of using a stored copy — confirm.
    val seedLinks = seedUrls.map { seed -> ParsableHyperlink("$seed -refresh", handleDocument) }

    val submission = context.submitAll(seedLinks)
    // NOTE(review): await() presumably blocks until the submitted work has finished — confirm.
    submission.await()
}