Merge pull request #4 from machinia/develop
Release 0.1.0
jcapona committed Aug 14, 2019
2 parents f297d5e + 75dfc67 commit 81247a4
Showing 31 changed files with 553 additions and 100 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
[run]
concurrency=multiprocessing
5 changes: 5 additions & 0 deletions .gitignore
@@ -5,3 +5,8 @@ __pycache__
*.log
/venv
.DS_Store
*.egg-info
build/
dist/
.coverage*
.eggs/
9 changes: 9 additions & 0 deletions .travis.yml
@@ -0,0 +1,9 @@
language: python
python:
- '3.6'
# command to run tests
script:
- 'python setup.py test'
after_success:
- pip install python-coveralls
- coveralls
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 machinia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
12 changes: 12 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,12 @@
include LICENSE
include README.rst
include MANIFEST.in
include setup.cfg
include setup.py

recursive-include requirements *.txt
graft scraper_factory

recursive-exclude __pycache__ *
global-exclude *.pyc *.pyo *.swp *.swo

31 changes: 0 additions & 31 deletions README.md

This file was deleted.

40 changes: 40 additions & 0 deletions README.rst
@@ -0,0 +1,40 @@
Scraper Factory
===============

Scraping library to retrieve data from useful pages, such as Amazon wishlists


Usage
-----

.. code:: python

    import scraper_factory
    scraper_factory.scrape(<<URL>>)

Sample output:

.. code:: sh

    [{
        'id': 'I2WF7234C0ZXFV',
        'title': 'AeroPress Coffee and Espresso Maker - Quickly Makes Delicious Coffee without Bitterness - 1 to 3 Cups Per Pressing',
        'link': '/dp/B0047BIWSK/?coliid=I2WF7234C0ZXFV&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/71Ud9NwXRpL._SS135_.jpg'
    }, {
        'id': 'I20ASZC8L6WX2V',
        'title': 'POP! Animation: Rick and Morty - Noob Noob 💛Limited Edition',
        'link': '/dp/B07STXB2JT/?coliid=I20ASZC8L6WX2V&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/61Hde8rm2qL._SS135_.jpg'
    }, {
        'id': 'I1JVIE5MZQ8JVC',
        'title': 'Logitech Easy‑Switch K811 Wireless Bluetooth Keyboard for Mac, iPad, iPhone, Apple TV',
        'link': '/dp/B0099SMFP2/?coliid=I1JVIE5MZQ8JVC&colid=2DZOVHLU6U46&psc=0&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/81InlOFJ-LL._SS135_.jpg'
    }, {
        'id': 'I399YP2BTOB0IL',
        'title': 'USB Type C to HDMI Digital AV Multiport Hub, USB-C (USB3.1) Adapter PD Charger for Nintendo Switch,Portable 4K HDMI Dock for Samsung Dex Station S10/9/8/Note8/9/Tab S4/S5,MacBook Pro/Air 2018,iPad Pro',
        'link': '/dp/B07JK9DFKH/?coliid=I399YP2BTOB0IL&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/61mcv6tD1eL._SS135_.jpg'
    }]
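
A minimal sketch of consuming the result, following the usage above; the wishlist URL is a made-up placeholder, and the keys match the sample output shown:

.. code:: python

    import scraper_factory

    # Hypothetical wishlist URL, for illustration only
    url = 'https://www.amazon.com/hz/wishlist/ls/EXAMPLE123'

    items = scraper_factory.scrape(url)  # returns a list of dicts
    for item in items:
        print(item['title'], '->', item['link'])
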
File renamed without changes.
3 changes: 3 additions & 0 deletions requirements/tests.txt
@@ -0,0 +1,3 @@
green>=2, <3
testfixtures>=6.3, <7
pycodestyle
14 changes: 0 additions & 14 deletions scraper.py

This file was deleted.

1 change: 1 addition & 0 deletions scraper_factory/__init__.py
@@ -0,0 +1 @@
from scraper_factory.core.scrape import scrape
Empty file.
28 changes: 28 additions & 0 deletions scraper_factory/core/base_spider.py
@@ -0,0 +1,28 @@
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from scrapy import Spider
from scraper_factory.core.utils import validate_url


class BaseSpider(ABC, Spider):

    def __init__(self, name, uri, queue, **kwargs):
        if not validate_url(uri):
            raise ValueError('Invalid URL')

        self.name = name
        parsed_url = urlparse(uri)
        self.start_urls = [uri]
        self.base_url = parsed_url.scheme + '://' + parsed_url.netloc
        self.allowed_domains = []
        self.allowed_domains.append(parsed_url.netloc)
        self.allowed_domains.append(uri.split('//')[-1])

        self.q = queue
        super().__init__(**kwargs)

    @abstractmethod
    def parse(self, response):
        """This method should implement how to parse the
        response data."""
        pass
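
BaseSpider leaves only parse() to concrete spiders: the constructor validates the URL, derives start_urls, base_url and allowed_domains, and stores the shared queue. A hypothetical subclass sketch (ExampleSpider and its selector are illustrative, not part of this commit):

.. code:: python

    from scraper_factory.core.base_spider import BaseSpider


    class ExampleSpider(BaseSpider):

        def __init__(self, uri, queue, **kwargs):
            # Register under a spider name and pass the shared queue upward
            super().__init__('example', uri, queue, **kwargs)

        def parse(self, response):
            # Push each extracted title onto the queue and yield it to Scrapy
            for title in response.css('h2::text').extract():
                item = {'title': title}
                self.q.put(item)
                yield item
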
23 changes: 23 additions & 0 deletions scraper_factory/core/scrape.py
@@ -0,0 +1,23 @@
from multiprocessing import Process, Queue
from scrapy import crawler
from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
from scraper_factory.core import utils


def run_spider(url, queue):
    process = crawler.CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) '
                      'Gecko/20100101 Firefox/38.0'
    })
    process.crawl(AmazonWishlistSpider, url, queue)
    process.start()


def scrape(url):
    q = Queue()
    p = Process(target=run_spider, args=(url, q))
    p.start()
    p.join()
    return utils.queue_to_list(q)
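
The detour through multiprocessing appears deliberate: CrawlerProcess starts a Twisted reactor, which cannot be restarted inside the same process, so running each crawl in its own Process lets scrape() be called repeatedly while results come back through the queue. A sketch under that reading (the URLs are placeholders):

.. code:: python

    from scraper_factory import scrape

    # Each call spawns a fresh crawler process, so repeated calls are fine;
    # these wishlist URLs are illustrative only.
    for url in ('https://www.amazon.com/hz/wishlist/ls/AAAA1111',
                'https://www.amazon.com/hz/wishlist/ls/BBBB2222'):
        items = scrape(url)
        print(url, '->', len(items), 'items')
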
41 changes: 41 additions & 0 deletions scraper_factory/core/utils.py
@@ -0,0 +1,41 @@
from multiprocessing.queues import Queue
from urllib.parse import urlparse


def queue_to_list(q):
    """
    Transforms a multiprocessing queue into a list
    :param q: multiprocessing Queue object
    :return: list with the elements from the queue
    """
    if not isinstance(q, Queue):
        raise TypeError('Argument must be a multiprocessing Queue')

    arr = []
    while not q.empty():
        arr.append(q.get())
    return arr


def remove_query_string(url):
    """
    Removes the query string from a URL
    :param url: string with a URL
    :return: clean base URL
    """
    if not isinstance(url, str):
        raise TypeError('Argument must be a string')
    return url.split('?')[0]


def validate_url(url):
    """
    Checks that the given string contains a valid URL
    :param url: string with a URL
    :return: True if the URL is valid, False otherwise
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except (ValueError, TypeError, AttributeError):
        return False
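
A quick sketch of the three helpers in isolation; the URLs and queue contents are arbitrary examples:

.. code:: python

    import time
    from multiprocessing import Queue
    from scraper_factory.core import utils

    q = Queue()
    q.put({'id': 'A'})
    q.put({'id': 'B'})
    time.sleep(0.1)  # let the queue's feeder thread flush before draining
    print(utils.queue_to_list(q))  # [{'id': 'A'}, {'id': 'B'}]

    print(utils.remove_query_string('https://example.com/dp/X?ref=abc'))  # https://example.com/dp/X
    print(utils.validate_url('https://example.com'))  # True
    print(utils.validate_url('not-a-url'))  # False
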
Empty file.
42 changes: 42 additions & 0 deletions scraper_factory/spiders/amazonwishlist.py
@@ -0,0 +1,42 @@
from scrapy import Request
from scraper_factory.core.utils import remove_query_string
from scraper_factory.core.base_spider import BaseSpider


class AmazonWishlistSpider(BaseSpider):

    def __init__(self, uri, queue, **kwargs):
        super().__init__('amazonwishlist', uri, queue, **kwargs)

    def parse(self, response):
        page_items = response.css('.g-item-sortable')

        for item in page_items:
            id = item.css('li::attr(data-itemid)').extract_first()
            title = item.css('#itemName_' + id + '::text').extract_first()
            link = item.css('#itemName_' + id + '::attr(href)')\
                .extract_first()
            if link:
                link = self.base_url + link
            img = item.css('#itemImage_' + id).css('img::attr(src)')\
                .extract_first()

            obj = {
                'id': id,
                'title': title,
                'link': remove_query_string(link),
                'img': remove_query_string(img)
            }

            self.q.put(obj)
            yield obj

        # manage "infinite scrolldown"
        has_next = response.css('#sort-by-price-next-batch-lek'
                                '::attr(value)').extract_first()
        if has_next:
            lek_uri = response.css(
                '#sort-by-price-load-more-items-url-next-batch::attr(value)')\
                .extract_first()
            next_page = self.base_url + lek_uri
            yield Request(next_page)
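
One way to exercise parse() without hitting Amazon is to feed it a hand-built response; the markup below is a guess at the minimal structure the selectors expect and is illustrative only:

.. code:: python

    from multiprocessing import Queue
    from scrapy.http import HtmlResponse
    from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider

    # Fake wishlist entry mimicking the ids and classes parse() looks for
    BODY = b'''
    <ul>
      <li class="g-item-sortable" data-itemid="ITEM1">
        <a id="itemName_ITEM1" href="/dp/B000TEST/?psc=1">Sample item</a>
        <div id="itemImage_ITEM1"><img src="https://example.com/i.jpg?x=1"></div>
      </li>
    </ul>
    '''

    spider = AmazonWishlistSpider('https://www.amazon.com/hz/wishlist/ls/EXAMPLE',
                                  Queue())
    response = HtmlResponse(url=spider.start_urls[0], body=BODY, encoding='utf-8')

    for result in spider.parse(response):
        print(result)  # e.g. {'id': 'ITEM1', 'title': 'Sample item', ...}
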
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[aliases]
test = green -vv -r --omit=".eggs*,*/__init__.py"
52 changes: 52 additions & 0 deletions setup.py
@@ -0,0 +1,52 @@
import os
import codecs
from setuptools import setup, find_packages

here = os.path.abspath(os.path.dirname(__file__))


def read(*folders):
    with codecs.open(os.path.join(here, *folders), encoding='utf-8') as f:
        return f.read()


def get_requirements(file_name):
    requires_file = read('requirements', file_name)
    return requires_file.splitlines()


long_description = read('README.rst')

setup(
    name='scraper-factory',

    version='0.1.0',

    description='Scraping library to retrieve data from useful pages, such as Amazon wishlists',
    long_description=long_description,

    url='https://github.com/machinia/scraper-factory',

    author='Pablo Ahumada, Jorge Capona',
    author_email='pablo.ahumadadiaz@gmail.com, jcapona@gmail.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development',
        'Topic :: System',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
    keywords='scraping wishlist amazon',
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    install_requires=get_requirements('default.txt'),
    extras_require={},
    package_data={},
    data_files=[],
    entry_points={},
    test_suite='tests',
    setup_requires=get_requirements('tests.txt'),
)
44 changes: 0 additions & 44 deletions spiders/amazonwishlist.py

This file was deleted.
