parse.py
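"""Fetch a URL with the spider that handles it and print the scraped items
and extracted requests, depth level by depth level.

This implements the "scrapy parse" command; see add_options() below for the
available flags.
"""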
from w3lib.url import is_url
from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
from scrapy.exceptions import UsageError
from scrapy import log


class Command(ScrapyCommand):

    requires_project = True

    spider = None
    # items and requests collected so far, keyed by depth level
    items = {}
    requests = {}
    first_response = None
    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
            help="use this spider without looking for one")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
            help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
            help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
            help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
            help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
            help="use this callback for parsing, instead of looking for a callback")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
            help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
            help="print each depth level one by one")

    @property
    def max_level(self):
        levels = self.items.keys() + self.requests.keys()
        if levels:
            return max(levels)
        return 0

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])
        print "# Scraped Items ", "-"*60
        display.pprint([dict(x) for x in items], colorize=colour)

    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            # without an explicit level, show only the requests collected at
            # the deepest level reached (the ones that were not followed)
            levels = self.requests.keys()
            if levels:
                requests = self.requests[max(levels)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])
        print "# Requests ", "-"*65
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in xrange(1, self.max_level+1):
                print '\n>>> DEPTH LEVEL: %s <<<' % level
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print '\n>>> STATUS DEPTH LEVEL %s <<<' % self.max_level
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)

    def run_callback(self, response, cb):
        items, requests = [], []
        # iterate_spider_output() normalizes whatever the callback returns
        # (a single object, an iterable or None) into an iterable
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, BaseItem):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def get_callback_from_rules(self, response):
        # return the callback of the first CrawlSpider rule whose link
        # extractor matches the response URL
        if getattr(self.spider, 'rules', None):
            for rule in self.spider.rules:
                if rule.link_extractor.matches(response.url) and rule.callback:
                    return rule.callback
        else:
            log.msg(format='No CrawlSpider rules found in spider %(spider)r, '
                           'please specify a callback to use for parsing',
                    level=log.ERROR, spider=self.spider.name)

    def set_spider(self, url, opts):
        if opts.spider:
            try:
                self.spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg(format='Unable to find spider: %(spider)s',
                        level=log.ERROR, spider=opts.spider)
        else:
            self.spider = create_spider_for_request(self.crawler.spiders, Request(url))
            if not self.spider:
                log.msg(format='Unable to find spider for: %(url)s',
                        level=log.ERROR, url=url)

    def start_parsing(self, url, opts):
        request = Request(url, opts.callback)
        request = self.prepare_request(request, opts)

        self.crawler.crawl(self.spider, [request])
        self.crawler.start()

        if not self.first_response:
            log.msg(format='No response downloaded for: %(request)s',
                    level=log.ERROR, request=request)

    def prepare_request(self, request, opts):
        # wrap the spider callback so that every response passes through this
        # command: items and requests are recorded per depth level, and the
        # extracted requests are re-wrapped until opts.depth is reached
        def callback(response):
            # memorize first response
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(response)
                else:
                    cb = 'parse'

            if not callable(cb):
                # cb may still be None if no rule matched; fall through to
                # the error below instead of crashing in getattr()
                cb_method = getattr(self.spider, cb, None) if cb else None
                if callable(cb_method):
                    cb = cb_method
                else:
                    log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s',
                            callback=cb, spider=self.spider.name, level=log.ERROR)
                    return

            # parse items and requests
            depth = response.meta['_depth']
            items, requests = self.run_callback(response, cb)
            self.add_items(depth, items)
            self.add_requests(depth, requests)

            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
            return requests

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def run(self, args, opts):
        # parse arguments
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        url = args[0]

        # prepare spider
        self.set_spider(url, opts)

        if self.spider and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
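

# Example invocations (spider and callback names are hypothetical):
#
#   scrapy parse --spider=myspider -c parse_item http://example.com/page
#   scrapy parse --rules -d 2 -v http://example.com/page
#
# The first form runs myspider.parse_item against the response; the second
# lets the spider's CrawlSpider rules pick the callback and follows the
# extracted requests one level deeper, printing each depth level in turn.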