Merge pull request #3 from machinia/base-spider
Base Spider class
jcapona committed Aug 14, 2019
2 parents 6eefcfa + 08e36b3 commit f14a498
Showing 8 changed files with 105 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,3 +8,5 @@ __pycache__
 *.egg-info
 build/
 dist/
+.coverage*
+.eggs/
28 changes: 28 additions & 0 deletions scraper_factory/core/base_spider.py
@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from urllib.parse import urlparse
+from scrapy import Spider
+from scraper_factory.core.utils import validate_url
+
+
+class BaseSpider(ABC, Spider):
+
+    def __init__(self, name, uri, queue, **kwargs):
+        if not validate_url(uri):
+            raise ValueError('Invalid URL')
+
+        self.name = name
+        parsed_url = urlparse(uri)
+        self.start_urls = [uri]
+        self.base_url = parsed_url.scheme + '://' + parsed_url.netloc
+        self.allowed_domains = []
+        self.allowed_domains.append(parsed_url.netloc)
+        self.allowed_domains.append(uri.split('//')[-1])
+
+        self.q = queue
+        super().__init__(**kwargs)
+
+    @abstractmethod
+    def parse(self, response):
+        """This method should implement how to parse the
+        response data."""
+        pass
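The ABC/Spider mix means a concrete spider now only supplies its name and a parse implementation; URL validation, start_urls, base_url, and allowed_domains all come from the base constructor. A minimal sketch of a subclass (the spider name and selector below are hypothetical, not part of this commit):

from scraper_factory.core.base_spider import BaseSpider


class ExampleSpider(BaseSpider):

    def __init__(self, uri, queue, **kwargs):
        # BaseSpider raises ValueError for a malformed uri before
        # Scrapy ever schedules a request
        super().__init__('example', uri, queue, **kwargs)

    def parse(self, response):
        # Yield one item per page title; a real spider would extract
        # its own fields and may yield follow-up Requests as well
        yield {'title': response.css('title::text').extract_first()}

Instantiating BaseSpider directly raises TypeError, since parse is declared abstract.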
7 changes: 4 additions & 3 deletions scraper_factory/core/scrape.py
@@ -1,14 +1,15 @@
-from scrapy import crawler
 from multiprocessing import Process, Queue
-
+from scrapy import crawler
 from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
 from scraper_factory.core import utils


 def run_spider(url, queue):
     process = crawler.CrawlerProcess(settings={
         'FEED_FORMAT': 'json',
-        'LOG_LEVEL': 'ERROR'
+        'LOG_LEVEL': 'ERROR',
+        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) '
+                      'Gecko/20100101 Firefox/38.0'
     })
     process.crawl(AmazonWishlistSpider, url, queue)
     process.start()
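Because process.start() runs the Twisted reactor to completion, run_spider blocks until the crawl ends, and a reactor cannot be restarted within one process; hence the Process and Queue imports. A sketch of the intended call pattern, assuming queue_to_list drains the queue into a list as its name and the truncated definition above suggest (the real scrape() helper is cut off in this diff and may differ):

from multiprocessing import Process, Queue

from scraper_factory.core.scrape import run_spider
from scraper_factory.core.utils import queue_to_list

q = Queue()
p = Process(target=run_spider,
            args=('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN', q))
p.start()
items = queue_to_list(q)  # drain before join() so a full pipe can't block the child
p.join()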
14 changes: 14 additions & 0 deletions scraper_factory/core/utils.py
@@ -1,4 +1,5 @@
 from multiprocessing.queues import Queue
+from urllib.parse import urlparse


 def queue_to_list(q):
@@ -25,3 +26,16 @@ def remove_query_string(url):
     if not isinstance(url, str):
         raise TypeError('Argument must be a string')
     return url.split('?')[0]
+
+
+def validate_url(url):
+    """
+    Checks that the given string contains a valid URL
+    :param url: string with a URL
+    :return: True if url is valid, False otherwise
+    """
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except (ValueError, TypeError, AttributeError):
+        return False
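urlparse rarely raises for string input, returning empty components instead, so the all([result.scheme, result.netloc]) check is what rejects scheme-less strings; the except clause covers non-string input, where urlparse fails before parsing starts. For example:

from scraper_factory.core.utils import validate_url

assert validate_url('https://www.google.com') is True
assert validate_url('google.com') is False   # parsed as a bare path: no scheme or netloc
assert validate_url(123) is False            # urlparse raises on non-string input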
24 changes: 7 additions & 17 deletions scraper_factory/spiders/amazonwishlist.py
@@ -1,22 +1,12 @@
-import scrapy
-import re
-
+from scrapy import Request
 from scraper_factory.core.utils import remove_query_string
+from scraper_factory.core.base_spider import BaseSpider


-class AmazonWishlistSpider(scrapy.Spider):
-    BASE_URL = 'https://www.amazon.com'
-    name = 'amazonwishlist'
-    allowed_domains = ['www.amazon.com']
+class AmazonWishlistSpider(BaseSpider):

     def __init__(self, uri, queue, **kwargs):
-        self.start_urls = [uri]
-        self.q = queue
-
-        domain = re.sub(r'(http|https)?://', '', uri)
-        self.allowed_domains.append(domain)
-
-        super().__init__(**kwargs)
+        super().__init__('amazonwishlist', uri, queue, **kwargs)

     def parse(self, response):
         page_items = response.css('.g-item-sortable')
@@ -27,7 +17,7 @@ def parse(self, response):
             link = item.css('#itemName_' + id + '::attr(href)')\
                 .extract_first()
             if link:
-                link = self.BASE_URL + link
+                link = self.base_url + link
             img = item.css('#itemImage_' + id).css('img::attr(src)')\
                 .extract_first()

@@ -48,5 +38,5 @@
         lek_uri = response.css(
             '#sort-by-price-load-more-items-url-next-batch::attr(value)')\
             .extract_first()
-        next_page = self.BASE_URL + lek_uri
-        yield scrapy.Request(next_page)
+        next_page = self.base_url + lek_uri
+        yield Request(next_page)
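The tail of parse follows the standard Scrapy pagination idiom: yield the current page's items, then yield a Request for the next batch, which Scrapy routes back through parse by default. Condensed to its shape (class name and item fields are placeholders, with an explicit None guard that the truncated hunk above may already contain):

from scrapy import Request
from scraper_factory.core.base_spider import BaseSpider


class PagedSpider(BaseSpider):  # hypothetical name, shape only

    def __init__(self, uri, queue, **kwargs):
        super().__init__('pagedspider', uri, queue, **kwargs)

    def parse(self, response):
        for item in response.css('.g-item-sortable'):
            yield {'row': item.extract()}  # placeholder: real fields elided
        lek_uri = response.css(
            '#sort-by-price-load-more-items-url-next-batch::attr(value)'
        ).extract_first()
        if lek_uri:  # extract_first() returns None on the last page
            yield Request(self.base_url + lek_uri)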
25 changes: 25 additions & 0 deletions tests/test_spiders/test_amazonwishlistspider.py
@@ -1,3 +1,5 @@
+from multiprocessing import Queue
+from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
 from tests.utils.spidertestbase import SpiderTestBase


@@ -29,3 +31,26 @@ def test_any_url(self):
         url = 'https://www.amazon.com/'
         results_file = 'amazon_wishlist_any_url.txt'
         self.verify_url_results(url, results_file)
+
+    def test_invalid_url(self):
+        """
+        Test the response of AmazonWishlistSpider for an input
+        that isn't a valid URL
+        """
+        url = 'not_an_url'
+        results_file = 'amazon_wishlist_any_url.txt'
+        self.verify_url_results(url, results_file)
+
+    def test_instance_params(self):
+        """
+        Test the object parameters that are set on instantiation
+        """
+        domain = 'https://'
+        url = 'www.amazon.com/hz/wishlist/ls/24XY9873RPAYN'
+        base_url = 'www.amazon.com'
+
+        sp = AmazonWishlistSpider(domain + url, Queue())
+        self.assertEqual(sp.base_url, domain + base_url)
+        self.assertEqual(sp.start_urls, [domain + url])
+        self.assertEqual(len(sp.allowed_domains), 2)
+        self.assertEqual(sp.allowed_domains, [base_url, url])
29 changes: 25 additions & 4 deletions tests/test_utils.py
@@ -1,8 +1,7 @@
-from scraper_factory.core import utils
-
-from unittest import TestCase
-from multiprocessing import Queue
 from time import sleep
+from multiprocessing import Queue
+from unittest import TestCase
+from scraper_factory.core import utils


 class TestCoreUtils(TestCase):
@@ -67,3 +66,25 @@ def test_exception_remove_query_string(self):
         arg = [None, list(), 1, {}]
         for a in arg:
             self.assertRaises(TypeError, utils.remove_query_string, a)
+
+    def test_validate_urls(self):
+        """
+        Tests validate_url with different strings and types emulating URLs
+        :return:
+        """
+        url_list = [
+            (123, False),
+            ({}, False),
+            ('', False),
+            ('aliexpress', False),
+            ('google.com', False),
+            ('https://www.google.com', True),
+            ('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN/ref=cm_go',
+             True),
+            ('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN?'
+             'filter=DEFAULT&viewType=list&lek=49ea-9a85-c1511c756f36',
+             True)
+        ]
+
+        for url, expected_response in url_list:
+            self.assertEqual(utils.validate_url(url), expected_response)
1 change: 0 additions & 1 deletion tests/utils/spidertestbase.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-
 from tests.utils import read_from_file
 from scraper_factory.core.scrape import scrape
