Skip to content

Commit

Permalink
Support for custom proxy sources in spider
Browse files Browse the repository at this point in the history
  • Loading branch information
lorien committed May 7, 2015
1 parent e9dcd52 commit 64ba5c5
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 9 deletions.
23 changes: 15 additions & 8 deletions grab/spider/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from grab.spider.data import Data
from grab.spider.stat import SpiderStat
from grab.spider.transport.multicurl import MulticurlTransport
from grab.proxylist import ProxyList
from grab.proxylist import ProxyList, BaseProxySource
from grab.util.misc import camel_case_to_underscore
from weblib.encoding import make_str, make_unicode

Expand Down Expand Up @@ -920,18 +920,25 @@ def run(self):
self.stop_timer('total')
self.shutdown()

def load_proxylist(self, source, source_type, proxy_type='http',
def load_proxylist(self, source, source_type=None, proxy_type='http',
auto_init=True, auto_change=True,
**kwargs):
self.proxylist = ProxyList()
if source_type == 'text_file':
self.proxylist.load_file(source, proxy_type=proxy_type)
elif source_type == 'url':
self.proxylist.load_url(source, proxy_type=proxy_type)
if isinstance(source, BaseProxySource):
self.proxylist.set_source(source)
elif isinstance(source, six.string_types):
if source_type == 'text_file':
self.proxylist.load_file(source, proxy_type=proxy_type)
elif source_type == 'url':
self.proxylist.load_url(source, proxy_type=proxy_type)
else:
raise SpiderMisuseError('Method `load_proxylist` received '
'invalid `source_type` argument: %s'
% source_type)
else:
raise SpiderMisuseError('Method `load_proxylist` received '
'invalid `source_type` argument: '
% source_type)
'invalid `source` argument: %s'
% source)

self.proxylist_enabled = True
self.proxy = None
Expand Down
29 changes: 28 additions & 1 deletion test/spider_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from grab import Grab
from grab.spider import Spider, Task
from test.util import BaseGrabTestCase, TEST_SERVER_PORT
from grab.proxylist import BaseProxySource, Proxy

ADDRESS = '127.0.0.1'
EXTRA_PORT1 = TEST_SERVER_PORT + 1
Expand All @@ -18,7 +19,6 @@ def prepare(self):
self.ports = set()

def task_baz(self, grab, task):
print(grab.request_headers)
self.ports.add(int(grab.response.headers.get('Listen-Port', 0)))


Expand All @@ -29,6 +29,7 @@ def setUpClass(cls):
extra_ports=[EXTRA_PORT1, EXTRA_PORT2])
cls.server.start()

"""
def test_setup_proxylist(self):
content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
open('/tmp/__proxy.txt', 'w').write(content)
Expand Down Expand Up @@ -122,3 +123,29 @@ def test_setup_grab(self):
self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
self.assertTrue(EXTRA_PORT2 not in bot.ports)
"""

def test_spider_custom_proxy_source(self):
    # Checks that `load_proxylist` accepts a `BaseProxySource` instance
    # directly, without any `source_type` argument.

    class CustomProxySource(BaseProxySource):
        # Proxy source whose only entry is the local test server.
        def load(self):
            local_proxy = Proxy(ADDRESS, TEST_SERVER_PORT, None, None, 'http')
            return [local_proxy]

    class TestSpider(Spider):
        # Spider that collects the port reported by each response.
        def prepare(self):
            self.ports = set()

        def task_page(self, grab, task):
            # A missing `Listen-Port` header is recorded as port 0.
            listen_port = grab.response.headers.get('Listen-Port', 0)
            self.ports.add(int(listen_port))

    proxy_source = CustomProxySource()
    page_task = Task('page', url='http://yandex.ru/')

    bot = TestSpider()
    bot.setup_queue()
    bot.load_proxylist(proxy_source)
    bot.add_task(page_task)
    bot.run()

    # The test server must have received the request addressed to the
    # remote host, and the only port seen must be the test server's.
    seen_host = self.server.request['headers']['host']
    self.assertEqual(seen_host, 'yandex.ru')
    expected_ports = set([TEST_SERVER_PORT])
    self.assertEqual(bot.ports, expected_ports)

0 comments on commit 64ba5c5

Please sign in to comment.