/
dev.clj
69 lines (63 loc) · 2.5 KB
/
dev.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
(ns skyscraper.dev
"Tools for interactive development of scrapers. See [doc/development-mode.md]
for an overview and example."
(:require
[clojure.core.async :refer [chan alts!!]]
[clojure.java.browse :refer [browse-url]]
[clojure.java.io :as io]
[skyscraper.core :as core]
[skyscraper.traverse :as traverse]
[taoensso.timbre :as log]))
(defn- browse-context
  "Dumps the given context's response body to a temporary HTML file
  and opens a browser on it.

  The body is taken from `[::core/response :body]` of `ctx` and read
  through `io/input-stream`. The temporary file is registered for
  deletion on JVM exit, so repeated invocations do not accumulate
  files forever. Returns whatever `browse-url` returns."
  [ctx]
  (let [^java.io.File f (doto (java.io.File/createTempFile "skyscraper-" ".html")
                          (.deleteOnExit))]
    (with-open [is (io/input-stream (get-in ctx [::core/response :body]))
                os (io/output-stream f)]
      (io/copy is os))
    ;; `browse-url` expects a URL. A bare file path only works when a
    ;; platform opener script happens to accept paths; the java.awt.Desktop
    ;; fallback needs an absolute URI, so hand it a proper file:/// URI.
    (browse-url (str (.toUri (.toPath f))))))
;; State of the currently suspended interactive scrape, or nil when there
;; is none. `scrape` stores a map with :resource, :context and :channels
;; here when it stops; `cleanup` resets it back to nil.
(def ^:private scrape-data (atom nil))
(defn cleanup
  "Finishes off a scrape that an earlier call to [[scrape]] left suspended.

  Does nothing (and returns nil) when no suspended scrape is recorded.
  Otherwise keeps taking from the item channel and the terminate channel
  until the terminate channel is the one that delivers, then clears the
  recorded scrape state."
  []
  (when-let [suspended @scrape-data]
    (let [{:keys [item-chan terminate-chan]} (:channels suspended)
          ports [item-chan terminate-chan]]
      (log/infof "Resuming suspended scrape to clean up")
      ;; Values taken from the item channel are discarded; only the port
      ;; that produced the value matters here.
      (loop [[_ port] (alts!! ports)]
        (if (= port terminate-chan)
          (reset! scrape-data nil)
          (recur (alts!! ports)))))))
(defn scrape
  "Interactive counterpart of [[skyscraper.core/scrape!]]: halts and opens a
  browser as soon as it sees an item flagged `::core/unimplemented`, i.e. a
  processor that isn't defined or has no process-fn.

  Any previously suspended scrape is first run to completion via [[cleanup]].
  The traversal is launched with `:parallelism 1` and a fresh `:item-chan`.
  When an unimplemented item is found, its resource, context and the
  traversal channels are remembered for [[document]] and
  [[run-last-processor]]. Always returns nil."
  [seed & {:as options}]
  (cleanup)
  (let [items    (chan)
        opts     (core/initialize-options
                  (assoc options :item-chan items :parallelism 1))
        start    (core/initialize-seed opts seed)
        channels (traverse/launch start opts)
        done     (:terminate-chan channels)]
    (loop []
      (let [[batch port] (alts!! [items done])]
        ;; A delivery on the terminate channel ends the loop with nil.
        (when-not (= port done)
          (if-let [stopped (some #(when (::core/unimplemented %) %) batch)]
            (let [resource (::core/resource stopped)
                  context  (::core/context stopped)]
              (reset! scrape-data {:resource resource
                                   :context  context
                                   :channels channels})
              (browse-context context)
              (log/infof "Scraping suspended in processor %s" (:processor context))
              nil)
            (recur)))))))
(defn document
  "Returns the resource recorded when the most recent [[scrape]] stopped,
  or nil if no suspended scrape is recorded."
  []
  (let [state (deref scrape-data)]
    (get state :resource)))
(defn run-last-processor
  "Invokes the processor on which [[scrape]] stopped, passing it the
  recorded resource and context.

  Throws an `ex-info` when there is no suspended scrape."
  []
  (let [state @scrape-data]
    ;; Guard clause: nothing to run unless a scrape is currently suspended.
    (when-not state
      (throw (ex-info "No interactive scraping in progress" {})))
    (let [resource (:resource state)
          context  (:context state)]
      (core/run-processor (:processor context) resource context))))