diff --git a/grab/spider/base.py b/grab/spider/base.py index 60922ff9..9992b675 100644 --- a/grab/spider/base.py +++ b/grab/spider/base.py @@ -242,7 +242,7 @@ def setup_grab(self, **kwargs): def check_task_limits(self, task): """ - Check that network/try counters are OK. + Check that task's network & try counters do not exceed limits. Returns: * if success: (True, None) @@ -397,9 +397,6 @@ def load_new_task(self): logger_verbose.debug('Task queue is empty.') return None - def process_task_counters(self, task): - task.network_try_count += 1 - def create_grab_instance(self, **kwargs): # Back-ward compatibility for deprecated `grab_config` attribute # Here I use `_grab_config` to not trigger warning messages @@ -701,8 +698,8 @@ def process_new_task(self, task): logger_verbose.debug('Submitting task to the transport ' 'layer') try: - self.transport.process_task(task, grab, - grab_config_backup) + self.transport.start_task_processing( + task, grab, grab_config_backup) except GrabInvalidUrl: logger.debug('Task %s has invalid URL: %s' % ( task.name, task.url)) @@ -797,10 +794,12 @@ def run(self): else: logger_verbose.debug('Got new task from task queue: %s' % task) - self.process_task_counters(task) + task.network_try_count += 1 is_valid, reason = self.check_task_limits(task) - if not is_valid: + if is_valid: + self.process_new_task(task) + else: logger_verbose.debug('Task %s is rejected due to ' '%s limit' % (task.name, reason)) @@ -817,9 +816,6 @@ def run(self): handler = task.get_fallback_handler(self) if handler: handler(task) - else: - self.process_new_task(task) - self.transport.process_handlers() with self.timer.log_time('network_transport'): logger_verbose.debug('Asking transport layer to do ' diff --git a/grab/spider/cache_backend/mysql.py b/grab/spider/cache_backend/mysql.py index 96df7526..8f840165 100644 --- a/grab/spider/cache_backend/mysql.py +++ b/grab/spider/cache_backend/mysql.py @@ -75,7 +75,7 @@ def get_item(self, url, timeout=None): """ _hash = self.build_hash(url) - with self.spider.stat.log_time('cache.read.mysql_query'): + with self.spider.timer.log_time('cache.read.mysql_query'): self.execute('BEGIN') if timeout is None: query = "" @@ -97,12 +97,12 @@ def get_item(self, url, timeout=None): return None def unpack_database_value(self, val): - with self.spider.stat.log_time('cache.read.unpack_data'): + with self.spider.timer.log_time('cache.read.unpack_data'): dump = zlib.decompress(val) return marshal.loads(dump) def build_hash(self, url): - with self.spider.stat.log_time('cache.read.build_hash'): + with self.spider.timer.log_time('cache.read.build_hash'): utf_url = make_str(url) return sha1(utf_url).hexdigest() @@ -176,7 +176,7 @@ def has_item(self, url, timeout=None): """ _hash = self.build_hash(url) - with self.spider.stat.log_time('cache.read.mysql_query'): + with self.spider.timer.log_time('cache.read.mysql_query'): if timeout is None: query = "" else: diff --git a/grab/spider/cache_backend/postgresql.py b/grab/spider/cache_backend/postgresql.py index 4ad38670..f3d57909 100644 --- a/grab/spider/cache_backend/postgresql.py +++ b/grab/spider/cache_backend/postgresql.py @@ -65,7 +65,7 @@ def get_item(self, url, timeout=None): """ _hash = self.build_hash(url) - with self.spider.stat.log_time('cache.read.postgresql_query'): + with self.spider.timer.log_time('cache.read.postgresql_query'): self.cursor.execute('BEGIN') if timeout is None: query = "" @@ -87,12 +87,12 @@ def get_item(self, url, timeout=None): return None def unpack_database_value(self, val): - with self.spider.stat.log_time('cache.read.unpack_data'): + with self.spider.timer.log_time('cache.read.unpack_data'): dump = zlib.decompress(val) return marshal.loads(dump) def build_hash(self, url): - with self.spider.stat.log_time('cache.read.build_hash'): + with self.spider.timer.log_time('cache.read.build_hash'): utf_url = make_str(url) return sha1(utf_url).hexdigest() @@ -170,7 +170,7 @@ def has_item(self, url, timeout=None): """ _hash = self.build_hash(url) - with self.spider.stat.log_time('cache.read.postgresql_query'): + with self.spider.timer.log_time('cache.read.postgresql_query'): if timeout is None: query = "" else: diff --git a/grab/spider/transport/multicurl.py b/grab/spider/transport/multicurl.py index 50a3ffad..f723e78b 100644 --- a/grab/spider/transport/multicurl.py +++ b/grab/spider/transport/multicurl.py @@ -42,7 +42,7 @@ def process_connection_count(self, curl): else: return curl - def process_task(self, task, grab, grab_config_backup): + def start_task_processing(self, task, grab, grab_config_backup): curl = self.process_connection_count(self.freelist.pop()) self.registry[id(curl)] = { diff --git a/test/spider_stat.py b/test/spider_stat.py index 5a9645fd..3224bba7 100644 --- a/test/spider_stat.py +++ b/test/spider_stat.py @@ -45,7 +45,7 @@ class TestSpider(Spider): pass bot = TestSpider() - self.assertRaises(KeyError, bot.stop_timer, 'zzz') + self.assertRaises(KeyError, bot.timer.stop, 'zzz') def test_counters_and_collections(self): from grab.stat import DEFAULT_COUNTER_KEY