Small refactoring. Bugfix.
lorien committed May 23, 2015
1 parent 6e70b6b commit 8e5ebb1
Showing 5 changed files with 17 additions and 21 deletions.
grab/spider/base.py: 7 additions & 11 deletions
@@ -242,7 +242,7 @@ def setup_grab(self, **kwargs):
 
     def check_task_limits(self, task):
         """
-        Check that network/try counters are OK.
+        Check that task's network & try counters do not exceed limits.
 
         Returns:
         * if success: (True, None)
@@ -397,9 +397,6 @@ def load_new_task(self):
         logger_verbose.debug('Task queue is empty.')
         return None
 
-    def process_task_counters(self, task):
-        task.network_try_count += 1
-
     def create_grab_instance(self, **kwargs):
         # Back-ward compatibility for deprecated `grab_config` attribute
         # Here I use `_grab_config` to not trigger warning messages
@@ -701,8 +698,8 @@ def process_new_task(self, task):
             logger_verbose.debug('Submitting task to the transport '
                                  'layer')
             try:
-                self.transport.process_task(task, grab,
-                                            grab_config_backup)
+                self.transport.start_task_processing(
+                    task, grab, grab_config_backup)
             except GrabInvalidUrl:
                 logger.debug('Task %s has invalid URL: %s' % (
                     task.name, task.url))
@@ -797,10 +794,12 @@ def run(self):
                 else:
                     logger_verbose.debug('Got new task from task queue: %s'
                                          % task)
-                    self.process_task_counters(task)
+                    task.network_try_count += 1
 
                     is_valid, reason = self.check_task_limits(task)
-                    if not is_valid:
+                    if is_valid:
+                        self.process_new_task(task)
+                    else:
                         logger_verbose.debug('Task %s is rejected due to '
                                              '%s limit'
                                              % (task.name, reason))
@@ -817,9 +816,6 @@ def run(self):
                         handler = task.get_fallback_handler(self)
                         if handler:
                             handler(task)
-                    else:
-                        self.process_new_task(task)
-
             self.transport.process_handlers()
 
             with self.timer.log_time('network_transport'):
                 logger_verbose.debug('Asking transport layer to do '
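Read across the hunks, the run() change does two things: the one-line process_task_counters() helper is inlined as task.network_try_count += 1, and the branching is inverted so process_new_task() sits right next to the limit check instead of in an else: branch twenty lines further down. A runnable condensation of the new flow, using hypothetical MiniSpider and Task stand-ins (the real logic lives in grab/spider/base.py):

class Task:
    def __init__(self, name, network_try_limit=3):
        self.name = name
        self.network_try_count = 0
        self.network_try_limit = network_try_limit

class MiniSpider:
    def check_task_limits(self, task):
        # Mirrors the documented contract: (True, None) on success,
        # (False, reason) when a limit is exceeded.
        if task.network_try_count > task.network_try_limit:
            return False, 'network-try'
        return True, None

    def process_new_task(self, task):
        print('processing', task.name)

    def handle(self, task):
        # After the refactoring: counter bump inlined, happy path first.
        task.network_try_count += 1
        is_valid, reason = self.check_task_limits(task)
        if is_valid:
            self.process_new_task(task)
        else:
            print('task %s is rejected due to %s limit'
                  % (task.name, reason))

MiniSpider().handle(Task('example'))  # -> processing example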
grab/spider/cache_backend/mysql.py: 4 additions & 4 deletions
@@ -75,7 +75,7 @@ def get_item(self, url, timeout=None):
         """
 
         _hash = self.build_hash(url)
-        with self.spider.stat.log_time('cache.read.mysql_query'):
+        with self.spider.timer.log_time('cache.read.mysql_query'):
             self.execute('BEGIN')
             if timeout is None:
                 query = ""
@@ -97,12 +97,12 @@ def get_item(self, url, timeout=None):
             return None
 
     def unpack_database_value(self, val):
-        with self.spider.stat.log_time('cache.read.unpack_data'):
+        with self.spider.timer.log_time('cache.read.unpack_data'):
            dump = zlib.decompress(val)
            return marshal.loads(dump)
 
     def build_hash(self, url):
-        with self.spider.stat.log_time('cache.read.build_hash'):
+        with self.spider.timer.log_time('cache.read.build_hash'):
            utf_url = make_str(url)
            return sha1(utf_url).hexdigest()
 
@@ -176,7 +176,7 @@ def has_item(self, url, timeout=None):
         """
 
         _hash = self.build_hash(url)
-        with self.spider.stat.log_time('cache.read.mysql_query'):
+        with self.spider.timer.log_time('cache.read.mysql_query'):
             if timeout is None:
                 query = ""
             else:
grab/spider/cache_backend/postgresql.py: 4 additions & 4 deletions
@@ -65,7 +65,7 @@ def get_item(self, url, timeout=None):
         """
 
         _hash = self.build_hash(url)
-        with self.spider.stat.log_time('cache.read.postgresql_query'):
+        with self.spider.timer.log_time('cache.read.postgresql_query'):
             self.cursor.execute('BEGIN')
             if timeout is None:
                 query = ""
@@ -87,12 +87,12 @@ def get_item(self, url, timeout=None):
             return None
 
     def unpack_database_value(self, val):
-        with self.spider.stat.log_time('cache.read.unpack_data'):
+        with self.spider.timer.log_time('cache.read.unpack_data'):
            dump = zlib.decompress(val)
            return marshal.loads(dump)
 
     def build_hash(self, url):
-        with self.spider.stat.log_time('cache.read.build_hash'):
+        with self.spider.timer.log_time('cache.read.build_hash'):
            utf_url = make_str(url)
            return sha1(utf_url).hexdigest()
 
@@ -170,7 +170,7 @@ def has_item(self, url, timeout=None):
         """
 
         _hash = self.build_hash(url)
-        with self.spider.stat.log_time('cache.read.postgresql_query'):
+        with self.spider.timer.log_time('cache.read.postgresql_query'):
             if timeout is None:
                 query = ""
             else:
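Both cache backends make the same one-word change: timing goes through self.spider.timer rather than self.spider.stat. The diff implies a timer object with a log_time(key) context manager, and the test change below implies a stop(key) that raises KeyError for unknown keys. A minimal sketch of such an API, written as an assumption (the real object comes from grab.stat):

import time
from contextlib import contextmanager

class Timer:
    """Hypothetical stand-in for the timer the spider exposes."""

    def __init__(self):
        self.timers = {}
        self._start = {}

    def start(self, key):
        self._start[key] = time.time()

    def stop(self, key):
        # pop() raises KeyError for a timer that was never started,
        # which is the behaviour the test below asserts.
        elapsed = time.time() - self._start.pop(key)
        self.timers[key] = self.timers.get(key, 0) + elapsed
        return elapsed

    @contextmanager
    def log_time(self, key):
        self.start(key)
        try:
            yield
        finally:
            self.stop(key)

# Usage in the cache-backend style:
timer = Timer()
with timer.log_time('cache.read.mysql_query'):
    pass  # run the query here
print(timer.timers)  # {'cache.read.mysql_query': <elapsed seconds>}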
grab/spider/transport/multicurl.py: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def process_connection_count(self, curl):
         else:
             return curl
 
-    def process_task(self, task, grab, grab_config_backup):
+    def start_task_processing(self, task, grab, grab_config_backup):
         curl = self.process_connection_count(self.freelist.pop())
 
         self.registry[id(curl)] = {
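This rename pairs with the call-site change in grab/spider/base.py above: the spider invokes the transport method by name at runtime, so renaming only one side would fail with AttributeError on the next request. A toy pairing with stand-in classes (no real multicurl involved):

class FakeTransport:
    def start_task_processing(self, task, grab, grab_config_backup):
        # Stand-in for assigning a curl handle and registering the task.
        print('started:', task)

class FakeSpider:
    def __init__(self, transport):
        self.transport = transport

    def process_new_task(self, task):
        # Same call shape as the renamed call in grab/spider/base.py.
        self.transport.start_task_processing(task, None, None)

FakeSpider(FakeTransport()).process_new_task('task:example')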
test/spider_stat.py: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class TestSpider(Spider):
            pass
 
        bot = TestSpider()
-        self.assertRaises(KeyError, bot.stop_timer, 'zzz')
+        self.assertRaises(KeyError, bot.timer.stop, 'zzz')
 
    def test_counters_and_collections(self):
        from grab.stat import DEFAULT_COUNTER_KEY
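The test fix mirrors the stat/timer split: the timer API now hangs off bot.timer. Reusing the hypothetical Timer sketch from the cache-backend note above, the asserted behaviour looks like this:

timer = Timer()       # the sketch class defined earlier
timer.start('net')
timer.stop('net')     # fine: the key exists
try:
    timer.stop('zzz')  # never started
except KeyError:
    # Matches: self.assertRaises(KeyError, bot.timer.stop, 'zzz')
    print('KeyError for unknown timer key, as the test asserts')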
