Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from machinia/develop
Release 0.1.0
- Loading branch information
Showing
31 changed files
with
553 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[run] | ||
concurrency=multiprocessing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,8 @@ __pycache__ | |
*.log | ||
/venv | ||
.DS_Store | ||
*.egg-info | ||
build/ | ||
dist/ | ||
.coverage* | ||
.eggs/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
language: python | ||
python: | ||
- '3.6' | ||
# command to run tests | ||
script: | ||
- 'python setup.py test' | ||
after_success: | ||
- pip install python-coveralls | ||
- coveralls |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2019 machinia | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
include LICENSE | ||
include README.rst | ||
include MANIFEST.in | ||
include setup.cfg | ||
include setup.py | ||
|
||
recursive-include requirements *.txt | ||
graft scraper_factory | ||
|
||
recursive-exclude __pycache__ * | ||
global-exclude *.pyc *.pyo *.swp *.swo | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
Scraping Factory | ||
================ | ||
|
||
Scraping library to retrieve data from useful pages, such as Amazon wishlists | ||
|
||
|
||
Usage | ||
----- | ||
|
||
.. code:: python | ||
import scraper_factory | ||
    scraper_factory.scrape('<<URL>>')
Sample output: | ||
|
||
.. code:: sh | ||
[{ | ||
'id': 'I2WF7234C0ZXFV', | ||
'title': 'AeroPress Coffee and Espresso Maker - Quickly Makes Delicious Coffee without Bitterness - 1 to 3 Cups Per Pressing', | ||
'link': '/dp/B0047BIWSK/?coliid=I2WF7234C0ZXFV&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it', | ||
'img': 'https://images-na.ssl-images-amazon.com/images/I/71Ud9NwXRpL._SS135_.jpg' | ||
}, { | ||
'id': 'I20ASZC8L6WX2V', | ||
'title': 'POP! Animation: Rick and Morty - Noob Noob 💛Limited Edition', | ||
'link': '/dp/B07STXB2JT/?coliid=I20ASZC8L6WX2V&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it', | ||
'img': 'https://images-na.ssl-images-amazon.com/images/I/61Hde8rm2qL._SS135_.jpg' | ||
}, { | ||
'id': 'I1JVIE5MZQ8JVC', | ||
'title': 'Logitech Easy‑Switch K811 Wireless Bluetooth Keyboard for Mac, iPad, iPhone, Apple TV', | ||
'link': '/dp/B0099SMFP2/?coliid=I1JVIE5MZQ8JVC&colid=2DZOVHLU6U46&psc=0&ref_=lv_vv_lig_dp_it', | ||
'img': 'https://images-na.ssl-images-amazon.com/images/I/81InlOFJ-LL._SS135_.jpg' | ||
}, { | ||
'id': 'I399YP2BTOB0IL', | ||
'title': 'USB Type C to HDMI Digital AV Multiport Hub, USB-C (USB3.1) Adapter PD Charger for Nintendo Switch,Portable 4K HDMI Dock for Samsung Dex Station S10/9/8/Note8/9/Tab S4/S5,MacBook Pro/Air 2018,iPad Pro', | ||
'link': '/dp/B07JK9DFKH/?coliid=I399YP2BTOB0IL&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it', | ||
'img': 'https://images-na.ssl-images-amazon.com/images/I/61mcv6tD1eL._SS135_.jpg' | ||
}] | ||
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
green>=2, <3 | ||
testfixtures>=6.3, <7 | ||
pycodestyle |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from scraper_factory.core.scrape import scrape |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from abc import ABC, abstractmethod | ||
from urllib.parse import urlparse | ||
from scrapy import Spider | ||
from scraper_factory.core.utils import validate_url | ||
|
||
|
||
class BaseSpider(ABC, Spider):
    """Abstract base class for this project's spiders.

    Validates the target URL, derives the spider's start/allowed-domain
    settings from it, and stores the multiprocessing queue used to hand
    scraped items back to the caller.
    """

    def __init__(self, name, uri, queue, **kwargs):
        if not validate_url(uri):
            raise ValueError('Invalid URL')

        parsed = urlparse(uri)
        self.name = name
        self.start_urls = [uri]
        self.base_url = parsed.scheme + '://' + parsed.netloc
        # Allow both the bare host and the scheme-stripped original URI.
        self.allowed_domains = [parsed.netloc, uri.split('//')[-1]]
        self.q = queue
        super().__init__(**kwargs)

    @abstractmethod
    def parse(self, response):
        """Extract data from *response*; concrete spiders implement this."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from multiprocessing import Process, Queue | ||
from scrapy import crawler | ||
from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider | ||
from scraper_factory.core import utils | ||
|
||
|
||
def run_spider(url, queue):
    """Run the Amazon wishlist spider in this process.

    :param url: wishlist url to crawl
    :param queue: multiprocessing queue the spider fills with items
    """
    settings = {
        'FEED_FORMAT': 'json',
        'LOG_LEVEL': 'ERROR',
        # Present as a desktop browser; some pages block default agents.
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) '
                      'Gecko/20100101 Firefox/38.0'
    }
    process = crawler.CrawlerProcess(settings=settings)
    process.crawl(AmazonWishlistSpider, url, queue)
    process.start()
|
||
|
||
def scrape(url):
    """Scrape *url* in a dedicated process and return the collected items.

    :param url: string with the url to scrape
    :return: list with the scraped item dicts
    """
    from queue import Empty  # stdlib; needed only for the drain loop below

    q = Queue()
    p = Process(target=run_spider, args=(url, q))
    p.start()

    # Drain the queue WHILE the child runs instead of join()ing first:
    # a child process that has written to a multiprocessing Queue blocks
    # at exit until its buffered items are consumed, so join-before-drain
    # can deadlock on large result sets (see the multiprocessing docs,
    # "Joining processes that use queues").
    results = []
    while p.is_alive():
        try:
            results.append(q.get(timeout=0.1))
        except Empty:
            pass  # nothing ready yet; re-check whether the child is done
    p.join()

    # Pick up anything the feeder thread flushed as the child exited.
    while True:
        try:
            results.append(q.get(timeout=0.1))
        except Empty:
            break
    return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from multiprocessing.queues import Queue | ||
from urllib.parse import urlparse | ||
|
||
|
||
def queue_to_list(q):
    """
    Drain a multiprocessing queue into a list.
    :param q: multiprocessing Queue object
    :return: list with the elements from the queue
    :raises TypeError: if *q* is not a multiprocessing Queue
    """
    if not isinstance(q, Queue):
        raise TypeError('Argument must be a multiprocessing Queue')

    drained = []
    while True:
        if q.empty():
            return drained
        drained.append(q.get())
|
||
|
||
def remove_query_string(url):
    """
    Strip everything from the first '?' onwards in a url.
    :param url: string with a url
    :return: clean base url
    :raises TypeError: if *url* is not a string
    """
    if not isinstance(url, str):
        raise TypeError('Argument must be a string')
    base, _, _ = url.partition('?')
    return base
|
||
|
||
def validate_url(url):
    """
    Check whether the given value parses as a url with both a scheme
    and a network location.
    :param url: string with a url
    :return: True if url is valid, False otherwise
    """
    try:
        parts = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # Non-string / unparsable input is simply "not a valid url".
        return False
    return bool(parts.scheme) and bool(parts.netloc)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from scrapy import Request | ||
from scraper_factory.core.utils import remove_query_string | ||
from scraper_factory.core.base_spider import BaseSpider | ||
|
||
|
||
class AmazonWishlistSpider(BaseSpider):
    """Spider that extracts items (id, title, link, image) from an
    Amazon wishlist page, following the "infinite scrolldown" batches."""

    def __init__(self, uri, queue, **kwargs):
        super().__init__('amazonwishlist', uri, queue, **kwargs)

    def parse(self, response):
        """Yield one dict per wishlist item and push it onto self.q.

        extract_first() returns None when a selector misses, so every
        extracted value is guarded before further processing: the old
        code passed None into remove_query_string(), which raises
        TypeError and aborted the crawl on items without a link/image.
        """
        page_items = response.css('.g-item-sortable')

        for item in page_items:
            item_id = item.css('li::attr(data-itemid)').extract_first()
            if not item_id:
                # Without an id we cannot build the per-item selectors.
                continue
            title = item.css('#itemName_' + item_id + '::text')\
                .extract_first()
            link = item.css('#itemName_' + item_id + '::attr(href)')\
                .extract_first()
            if link:
                link = self.base_url + link
            img = item.css('#itemImage_' + item_id).css('img::attr(src)')\
                .extract_first()

            obj = {
                'id': item_id,
                'title': title,
                # Guard: remove_query_string requires a str argument.
                'link': remove_query_string(link) if link else link,
                'img': remove_query_string(img) if img else img
            }

            self.q.put(obj)
            yield obj

        # manage "infinite scrolldown": a hidden input signals that the
        # server has another batch of items to load.
        has_next = response.css('#sort-by-price-next-batch-lek'
                                '::attr(value)').extract_first()
        if has_next:
            lek_uri = response.css(
                '#sort-by-price-load-more-items-url-next-batch::attr(value)')\
                .extract_first()
            next_page = self.base_url + lek_uri
            yield Request(next_page)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[aliases] | ||
test = green -vv -r --omit=".eggs*,*/__init__.py" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import os | ||
import codecs | ||
from setuptools import setup, find_packages | ||
|
||
# Absolute path of the directory containing this setup.py; anchors all
# relative file reads (README, requirements) below.
here = os.path.abspath(os.path.dirname(__file__))
|
||
|
||
def read(*folders):
    """Return the UTF-8 decoded contents of the file at here/<*folders>."""
    path = os.path.join(here, *folders)
    with codecs.open(path, encoding='utf-8') as fp:
        return fp.read()
|
||
|
||
def get_requirements(file_name):
    """Read requirements/<file_name> and return its lines as a list."""
    return read('requirements', file_name).splitlines()
|
||
|
||
# Long description shown on PyPI, sourced verbatim from the README.
long_description = read('README.rst')

setup(
    name='scraper-factory',

    version='0.1.0',

    description='Scraping library to retrieve data from useful pages, such as Amazon wishlists',
    long_description=long_description,

    url='https://github.com/machinia/scraper-factory',

    author='Pablo Ahumada, Jorge Capona',
    author_email='pablo.ahumadadiaz@gmail.com, jcapona@gmail.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development',
        'Topic :: System',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
    keywords='scraping wishlist amazon',
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    # Runtime dependencies come from requirements/default.txt.
    install_requires=get_requirements('default.txt'),
    extras_require={},
    package_data={},
    data_files=[],
    entry_points={},
    # `python setup.py test` (aliased to green in setup.cfg) uses these.
    test_suite='tests',
    setup_requires=get_requirements('tests.txt'),
)
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.