Base Spider class #3

Merged 5 commits on Aug 14, 2019
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,3 +8,5 @@ __pycache__
 *.egg-info
 build/
 dist/
+.coverage*
+.eggs/
28 changes: 28 additions & 0 deletions scraper_factory/core/base_spider.py
@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from urllib.parse import urlparse
+from scrapy import Spider
+from scraper_factory.core.utils import validate_url
+
+
+class BaseSpider(ABC, Spider):
+
+    def __init__(self, name, uri, queue, **kwargs):
+        if not validate_url(uri):
+            raise ValueError('Invalid URL')
+
+        self.name = name
+        parsed_url = urlparse(uri)
+        self.start_urls = [uri]
+        self.base_url = parsed_url.scheme + '://' + parsed_url.netloc
+        self.allowed_domains = []
+        self.allowed_domains.append(parsed_url.netloc)
+        self.allowed_domains.append(uri.split('//')[-1])
+
+        self.q = queue
+        super().__init__(**kwargs)
+
+    @abstractmethod
+    def parse(self, response):
+        """This method should implement how to parse the
+        response data."""
+        pass
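For orientation, a concrete spider built on this base class only has to forward a name, a start URI, and a result queue to BaseSpider.__init__ and implement parse(). A minimal sketch; the spider name, target site, and CSS selectors here are invented for illustration, and pushing items onto self.q mirrors how the queue appears to be used elsewhere in this PR:

from multiprocessing import Queue

from scraper_factory.core.base_spider import BaseSpider


class QuotesSpider(BaseSpider):

    def __init__(self, uri, queue, **kwargs):
        # URL validation, start_urls, base_url and allowed_domains
        # are all populated by BaseSpider.__init__.
        super().__init__('quotes', uri, queue, **kwargs)

    def parse(self, response):
        # Hand each scraped item to the shared queue and to Scrapy.
        for quote in response.css('.quote'):
            item = {'text': quote.css('.text::text').extract_first()}
            self.q.put(item)
            yield item


spider = QuotesSpider('https://quotes.toscrape.com', Queue())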
7 changes: 4 additions & 3 deletions scraper_factory/core/scrape.py
@@ -1,14 +1,15 @@
-from scrapy import crawler
 from multiprocessing import Process, Queue
 
+from scrapy import crawler
 from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
 from scraper_factory.core import utils
 
 
 def run_spider(url, queue):
     process = crawler.CrawlerProcess(settings={
         'FEED_FORMAT': 'json',
-        'LOG_LEVEL': 'ERROR'
+        'LOG_LEVEL': 'ERROR',
+        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) '
+                      'Gecko/20100101 Firefox/38.0'
     })
     process.crawl(AmazonWishlistSpider, url, queue)
     process.start()
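Because CrawlerProcess can only be started once per Python process, run_spider is evidently meant to be launched in a child process (hence the Process and Queue imports), with results read back through the queue. A sketch of that calling pattern, assuming the queue_to_list helper from scraper_factory.core.utils shown below; the wishlist URL is the one used in this PR's tests:

from multiprocessing import Process, Queue

from scraper_factory.core import utils
from scraper_factory.core.scrape import run_spider

if __name__ == '__main__':
    q = Queue()
    p = Process(target=run_spider,
                args=('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN', q))
    p.start()
    p.join()  # wait for the crawl; for very large result sets, drain first
    items = utils.queue_to_list(q)  # collect everything the spider queued
    print('%d items scraped' % len(items))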
14 changes: 14 additions & 0 deletions scraper_factory/core/utils.py
@@ -1,4 +1,5 @@
 from multiprocessing.queues import Queue
+from urllib.parse import urlparse
 
 
 def queue_to_list(q):
@@ -25,3 +26,16 @@ def remove_query_string(url):
     if not isinstance(url, str):
         raise TypeError('Argument must be a string')
     return url.split('?')[0]
+
+
+def validate_url(url):
+    """
+    Checks that the given string contains a valid URL
+    :param url: string with a URL
+    :return: True if the URL is valid, False otherwise
+    """
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except (ValueError, TypeError, AttributeError):
+        return False
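Note that validate_url only checks that urlparse can recover both a scheme and a network location, so bare domains fail while any scheme-qualified URL passes. A quick illustration, consistent with the test table added to tests/test_utils.py later in this diff:

from scraper_factory.core.utils import validate_url

assert validate_url('https://www.google.com')
assert not validate_url('google.com')  # no scheme, so urlparse yields no netloc
assert not validate_url(123)           # non-string input raises inside urlparse and is caught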
24 changes: 7 additions & 17 deletions scraper_factory/spiders/amazonwishlist.py
@@ -1,22 +1,12 @@
-import scrapy
-import re
-
+from scrapy import Request
 from scraper_factory.core.utils import remove_query_string
+from scraper_factory.core.base_spider import BaseSpider
 
 
-class AmazonWishlistSpider(scrapy.Spider):
-    BASE_URL = 'https://www.amazon.com'
-    name = 'amazonwishlist'
-    allowed_domains = ['www.amazon.com']
+class AmazonWishlistSpider(BaseSpider):
 
     def __init__(self, uri, queue, **kwargs):
-        self.start_urls = [uri]
-        self.q = queue
-
-        domain = re.sub(r'(http|https)?://', '', uri)
-        self.allowed_domains.append(domain)
-
-        super().__init__(**kwargs)
+        super().__init__('amazonwishlist', uri, queue, **kwargs)
 
     def parse(self, response):
         page_items = response.css('.g-item-sortable')
@@ -27,7 +17,7 @@ def parse(self, response):
             link = item.css('#itemName_' + id + '::attr(href)')\
                 .extract_first()
             if link:
-                link = self.BASE_URL + link
+                link = self.base_url + link
             img = item.css('#itemImage_' + id).css('img::attr(src)')\
                 .extract_first()
@@ -48,5 +38,5 @@ def parse(self, response):
         lek_uri = response.css(
             '#sort-by-price-load-more-items-url-next-batch::attr(value)')\
             .extract_first()
-        next_page = self.BASE_URL + lek_uri
-        yield scrapy.Request(next_page)
+        next_page = self.base_url + lek_uri
+        yield Request(next_page)
25 changes: 25 additions & 0 deletions tests/test_spiders/test_amazonwishlistspider.py
@@ -1,3 +1,5 @@
+from multiprocessing import Queue
+from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
 from tests.utils.spidertestbase import SpiderTestBase
 
 
@@ -29,3 +31,26 @@ def test_any_url(self):
         url = 'https://www.amazon.com/'
         results_file = 'amazon_wishlist_any_url.txt'
         self.verify_url_results(url, results_file)
+
+    def test_invalid_url(self):
+        """
+        Test the response of AmazonWishlistSpider for a page
+        that isn't an Amazon wishlist
+        """
+        url = 'not_an_url'
+        results_file = 'amazon_wishlist_any_url.txt'
+        self.verify_url_results(url, results_file)
+
+    def test_instance_params(self):
+        """
+        Test the object parameters that are set on instantiation
+        """
+        domain = 'https://'
+        url = 'www.amazon.com/hz/wishlist/ls/24XY9873RPAYN'
+        base_url = 'www.amazon.com'
+
+        sp = AmazonWishlistSpider(domain + url, Queue())
+        self.assertEqual(sp.base_url, domain + base_url)
+        self.assertEqual(sp.start_urls, [domain + url])
+        self.assertEqual(len(sp.allowed_domains), 2)
+        self.assertEqual(sp.allowed_domains, [base_url, url])
29 changes: 25 additions & 4 deletions tests/test_utils.py
@@ -1,8 +1,7 @@
-from scraper_factory.core import utils
-
-from unittest import TestCase
-from multiprocessing import Queue
 from time import sleep
+from multiprocessing import Queue
+from unittest import TestCase
+from scraper_factory.core import utils
 
 
 class TestCoreUtils(TestCase):
@@ -67,3 +66,25 @@ def test_exception_remove_query_string(self):
         arg = [None, list(), 1, {}]
         for a in arg:
             self.assertRaises(TypeError, utils.remove_query_string, a)
+
+    def test_validate_urls(self):
+        """
+        Tests validate_url using different strings to emulate a URL
+        :return:
+        """
+        url_list = [
+            (123, False),
+            ({}, False),
+            ('', False),
+            ('aliexpress', False),
+            ('google.com', False),
+            ('https://www.google.com', True),
+            ('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN/ref=cm_go',
+             True),
+            ('https://www.amazon.com/hz/wishlist/ls/24XY9873RPAYN?'
+             'filter=DEFAULT&viewType=list&lek=49ea-9a85-c1511c756f36',
+             True)
+        ]
+
+        for url, expected_response in url_list:
+            self.assertEqual(utils.validate_url(url), expected_response)
1 change: 0 additions & 1 deletion tests/utils/spidertestbase.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-
 from tests.utils import read_from_file
 from scraper_factory.core.scrape import scrape
