added the grabbers and unit-tests for 3 parsers.

1 parent 9e0ef68 commit d186b8ec3473a11319ab67b89881e11a478d40fb @phoenix24 committed Mar 23, 2012
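
For orientation, a minimal usage sketch of the config-driven grabber this commit introduces. It mirrors the test setup in the diff below; the import path is an assumption, and the config values (local URL, GrabberTest database, FKART source tag) are placeholders lifted from the Flipkart test, not production settings.

from flipkart import FlipkartCrawler, FlipkartGrabber, FlipkartInventory  # module path assumed

config = {
    "url"     : "http://localhost/page",   # inventory page to fetch
    "source"  : "FKART",                   # tag stored with each crawled page
    "parser"  : FlipkartInventory,         # turns page HTML into inventory items
    "crawler" : FlipkartCrawler,           # wraps httplib2.Http and builds the page record
    "dbname"  : "GrabberTest",             # MongoDB database (test value)
    "dbhost"  : "localhost",               # MongoDB host
}

grabber = FlipkartGrabber(config)
grabber.grab()
# grab() now persists everything itself:
#   - the raw page record (url, source, tstamp, content, response) goes into db.pages
#   - the items from parser.get_inventory() go into db.inventory

Note the behavioural change: grab() no longer returns the parsed inventory (and the old dump() helper is gone); it writes both the raw page and the parsed items to MongoDB, which is why the updated tests assert on collection counts rather than on a return value.
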
@@ -6,6 +6,7 @@
from BeautifulSoup import BeautifulSoup as bsoup
from hashlib import md5
+from datetime import datetime
from pymongo.connection import Connection
@@ -86,52 +87,51 @@ def get_inventory(self):
class BaseCrawler(object):
""" the abstract crawler class! """
- def __init__ (self, config, url):
+ def __init__ (self, config):
self.config = config
self.handle = Http(".cache")
- self.url = url
+ self.page = None
self.resp = None
self.content = None
def crawl_page(self):
- self.resp, self.content = self.handle.request( self.url )
- return self.content
+ self.resp, self.content = self.handle.request( self.config["url"] )
+ self.page = {
+ "url" : self.config["url"],
+ "source" : self.config["source"],
+ "tstamp" : datetime.now(),
+ "content" : self.content,
+ "response" : self.resp
+ }
+ return self.page
class BaseGrabber(object):
"""
the abstract grabber class!
- # all the book-keeping needs to be done here.
- ## todo : store crawled urls into the db.
- ## todo : store crawled pages into the db.
- ## todo : store extracted inventory into the db.
"""
def __init__(self, config):
self.config = config
self.parser = self.config['parser']
self.crawler = self.config['crawler']
- self.db_name = "grabber"
- self.db_host = "localhost"
+ self.db_name = self.config['dbname']
+ self.db_host = self.config['dbhost']
self.connection = Connection(self.db_host)
self.db = self.connection[self.db_name]
-
-
+
def grab(self):
- # save url+crawled page.
- url = self.config['url']
- crawler = self.crawler(self.config, url)
+ crawler = self.crawler(self.config)
+ page = crawler.crawl_page()
- # save parsed content.
- parser = self.parser( crawler.crawl_page() )
-
- return parser.get_inventory()
+ #time-stamp; source; url;
+ self.db.pages.insert(page)
- def dump(self):
- inventory = self.grab()
- self.db.inventory.insert(inventory)
-
+ #parsed inventory
+ parser = self.parser(page['content'])
+ self.db.inventory.insert(parser.get_inventory())
+
@@ -47,9 +47,9 @@ def get_item_source(self):
class FlipkartCrawler(BaseCrawler):
""" flipkart inventory page crawler. """
- def __init__(self, config, url):
- super(FlipkartCrawler, self).__init__(config, url)
-
+ def __init__(self, config):
+ super(FlipkartCrawler, self).__init__(config)
+
class FlipkartGrabber(BaseGrabber):
""" flipkart page grabber """
@@ -32,8 +32,8 @@ def get_item_source(self):
class InfibeamCrawler(BaseCrawler):
""" infibeam inventory page crawler. """
- def __init__(self, config, url):
- super(InfibeamCrawler, self).__init__(config, url)
+ def __init__(self, config):
+ super(InfibeamCrawler, self).__init__(config)
class InfibeamGrabber(BaseGrabber):
@@ -30,8 +30,8 @@ def get_item_source(self):
class SaholicCrawler(BaseCrawler):
""" saholic inventory page crawler. """
- def __init__(self, config, url):
- super(SaholicCrawler, self).__init__(config, url)
+ def __init__(self, config):
+ super(SaholicCrawler, self).__init__(config)
class SaholicGrabber(BaseGrabber):
@@ -116,7 +116,7 @@ def test_get_inventory(self):
class TestFlipkartCrawler(unittest.TestCase):
def setUp(self):
- self.crawler = FlipkartCrawler({}, "http://localhost/page")
+ self.crawler = FlipkartCrawler({ "url" : "http://localhost/page" })
def tearDown(self):
self.crawler = None
@@ -128,28 +128,40 @@ def test_crawl_page(self):
class TestFlipkartGrabber(unittest.TestCase):
-
+
+ def setUp(self):
+ self.grabber = FlipkartGrabber({
+ "url" : "http://localhost/page",
+ "parser" : FlipkartInventory,
+ "crawler" : FlipkartCrawler,
+ "dbname" : "GrabberTest",
+ "dbhost" : "localhost",
+ "source" : "FKART"
+ })
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
+
+ def tearDown(self):
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
+
@staticmethod
def FakeResponse(a):
test = file("backend/test/data/inventory/test_20120310_055847_flipkart.html", "r").read()
- test_data = str(bsoup(test).fetch('div', 'fk-srch-item')[0])
-
+ test_data = str(bsoup(test).fetch('div', 'fk-srch-item'))
return '200 OK', test_data
-
+
@patch.object(Http, 'request', FakeResponse)
def test_inventory_grab(self):
+ # kick inventory grabber.
+ self.grabber.grab()
+
+ #assert inventory insertion
+ self.assertEquals(10, self.grabber.db.inventory.count())
- grabber = FlipkartGrabber({
- "url" : "http://localhost/page",
- "parser" : FlipkartInventory,
- "crawler" : FlipkartCrawler
- })
-
- inventory = grabber.grab()
- self.assertEquals(1, len(inventory))
-
-
+ #assert page insertion
+ self.assertEquals(1, self.grabber.db.pages.count())
if '__main__' == __name__:
@@ -95,7 +95,7 @@ def test_get_inventory(self):
class TestInfibeamCrawler(unittest.TestCase):
def setUp(self):
- self.crawler = InfibeamCrawler({}, "http://localhost/page")
+ self.crawler = InfibeamCrawler({ "url" : "http://localhost/page" })
def tearDown(self):
self.crawler = None
@@ -106,6 +106,22 @@ def test_crawl_page(self):
class TestInfibeamGrabber(unittest.TestCase):
+
+ def setUp(self):
+ self.grabber = InfibeamGrabber({
+ "url" : "http://localhost/page",
+ "parser" : InfibeamInventory,
+ "crawler" : InfibeamCrawler,
+ "dbname" : "GrabberTest",
+ "dbhost" : "localhost",
+ "source" : "IBEAM"
+ })
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
+
+ def tearDown(self):
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
@staticmethod
def FakeResponse(a):
@@ -114,20 +130,18 @@ def FakeResponse(a):
#monkey patching test-data to get the correct minimal test-data
test_data = str("<ul class='srch_result portrait'>" + test_data + "</ul>")
- return '200 OK', test_data
-
-
+ return '200 OK', test_data
+
@patch.object(Http, 'request', FakeResponse)
def test_inventory_grab(self):
+ # kick inventory grabber.
+ self.grabber.grab()
+
+ #assert inventory insertion
+ self.assertEquals(1, self.grabber.db.inventory.count())
- grabber = InfibeamGrabber({
- "url" : "http://localhost/page",
- "parser" : InfibeamInventory,
- "crawler" : InfibeamCrawler
- })
-
- inventory = grabber.grab()
- self.assertEquals(1, len(inventory))
+ #assert page insertion
+ self.assertEquals(1, self.grabber.db.pages.count())
if '__main__' == __name__:
@@ -96,7 +96,7 @@ def test_get_inventory(self):
class TestSaholicCrawler(unittest.TestCase):
def setUp(self):
- self.crawler = SaholicCrawler({}, "http://localhost/page")
+ self.crawler = SaholicCrawler({ "url" : "http://localhost/page" })
def tearDown(self):
self.crawler = None
@@ -107,27 +107,40 @@ def test_crawl_page(self):
class TestSaholicGrabber(unittest.TestCase):
+
+ def setUp(self):
+ self.grabber = SaholicGrabber({
+ "url" : "http://localhost/page",
+ "parser" : SaholicInventory,
+ "crawler" : SaholicCrawler,
+ "dbname" : "GrabberTest",
+ "dbhost" : "localhost",
+ "source" : "IBEAM"
+ })
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
+
+ def tearDown(self):
+ self.grabber.db.pages.drop()
+ self.grabber.db.inventory.drop()
@staticmethod
def FakeResponse(a):
test = file("backend/test/data/inventory/test_20120310_055847_saholic.html", "r").read()
test_data = str(bsoup(test).fetch('div', 'productItem')[0])
-
return '200 OK', test_data
-
-
+
@patch.object(Http, 'request', FakeResponse)
def test_inventory_grab(self):
+ # kick inventory grabber.
+ self.grabber.grab()
+
+ #assert inventory insertion
+ self.assertEquals(1, self.grabber.db.inventory.count())
- grabber = SaholicGrabber({
- "url" : "http://localhost/page",
- "parser" : SaholicInventory,
- "crawler" : SaholicCrawler
- })
+ #assert page insertion
+ self.assertEquals(1, self.grabber.db.pages.count())
- inventory = grabber.grab()
- self.assertEquals(1, len(inventory))
-
if '__main__' == __name__:
unittest.main()
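
A side note on the test pattern above: each new grabber test stubs httplib2's Http.request with a canned response so no network I/O happens, while the MongoDB writes go against a real local server (database GrabberTest, dropped in setUp and tearDown). A minimal standalone sketch of the same stubbing idea, using illustrative names and a plain function in place of the staticmethod the tests use:

from mock import patch
from httplib2 import Http

def fake_request(self, url, *args, **kwargs):
    # canned (response, content) pair standing in for a crawled inventory page
    return '200 OK', "<html>stub inventory page</html>"

with patch.object(Http, 'request', fake_request):
    resp, content = Http(".cache").request("http://localhost/page")
    assert resp == '200 OK'
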
