Merge pull request #4 from machinia/develop
Release 0.1.0
jcapona committed Aug 14, 2019
2 parents f297d5e + 75dfc67 commit 81247a4
Showing 31 changed files with 553 additions and 100 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
[run]
concurrency=multiprocessing
5 changes: 5 additions & 0 deletions .gitignore
@@ -5,3 +5,8 @@ __pycache__
*.log
/venv
.DS_Store
*.egg-info
build/
dist/
.coverage*
.eggs/
9 changes: 9 additions & 0 deletions .travis.yml
@@ -0,0 +1,9 @@
language: python
python:
- '3.6'
# command to run tests
script:
- 'python setup.py test'
after_success:
- pip install python-coveralls
- coveralls
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 machinia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
12 changes: 12 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,12 @@
include LICENSE
include README.rst
include MANIFEST.in
include setup.cfg
include setup.py

recursive-include requirements *.txt
graft scraper_factory

recursive-exclude __pycache__ *
global-exclude *.pyc *.pyo *.swp *.swo

31 changes: 0 additions & 31 deletions README.md

This file was deleted.

40 changes: 40 additions & 0 deletions README.rst
@@ -0,0 +1,40 @@
Scraper Factory
===============

Scraping library to retrieve data from useful pages, such as Amazon wishlists


Usage
-----

.. code:: python

    import scraper_factory
    scraper_factory.scrape(<<URL>>)

Sample output:

.. code:: sh

    [{
        'id': 'I2WF7234C0ZXFV',
        'title': 'AeroPress Coffee and Espresso Maker - Quickly Makes Delicious Coffee without Bitterness - 1 to 3 Cups Per Pressing',
        'link': '/dp/B0047BIWSK/?coliid=I2WF7234C0ZXFV&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/71Ud9NwXRpL._SS135_.jpg'
    }, {
        'id': 'I20ASZC8L6WX2V',
        'title': 'POP! Animation: Rick and Morty - Noob Noob 💛Limited Edition',
        'link': '/dp/B07STXB2JT/?coliid=I20ASZC8L6WX2V&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/61Hde8rm2qL._SS135_.jpg'
    }, {
        'id': 'I1JVIE5MZQ8JVC',
        'title': 'Logitech Easy‑Switch K811 Wireless Bluetooth Keyboard for Mac, iPad, iPhone, Apple TV',
        'link': '/dp/B0099SMFP2/?coliid=I1JVIE5MZQ8JVC&colid=2DZOVHLU6U46&psc=0&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/81InlOFJ-LL._SS135_.jpg'
    }, {
        'id': 'I399YP2BTOB0IL',
        'title': 'USB Type C to HDMI Digital AV Multiport Hub, USB-C (USB3.1) Adapter PD Charger for Nintendo Switch,Portable 4K HDMI Dock for Samsung Dex Station S10/9/8/Note8/9/Tab S4/S5,MacBook Pro/Air 2018,iPad Pro',
        'link': '/dp/B07JK9DFKH/?coliid=I399YP2BTOB0IL&colid=2DZOVHLU6U46&psc=1&ref_=lv_vv_lig_dp_it',
        'img': 'https://images-na.ssl-images-amazon.com/images/I/61mcv6tD1eL._SS135_.jpg'
    }]
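
A minimal sketch of consuming the result, following the usage above; the wishlist URL is a made-up placeholder, and the keys match the sample output shown:

.. code:: python

    import scraper_factory

    # Hypothetical wishlist URL, for illustration only
    url = 'https://www.amazon.com/hz/wishlist/ls/EXAMPLE123'

    items = scraper_factory.scrape(url)  # returns a list of dicts
    for item in items:
        print(item['title'], '->', item['link'])
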
File renamed without changes.
3 changes: 3 additions & 0 deletions requirements/tests.txt
@@ -0,0 +1,3 @@
green>=2, <3
testfixtures>=6.3, <7
pycodestyle
14 changes: 0 additions & 14 deletions scraper.py

This file was deleted.

1 change: 1 addition & 0 deletions scraper_factory/__init__.py
@@ -0,0 +1 @@
from scraper_factory.core.scrape import scrape
Empty file.
28 changes: 28 additions & 0 deletions scraper_factory/core/base_spider.py
@@ -0,0 +1,28 @@
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from scrapy import Spider
from scraper_factory.core.utils import validate_url


class BaseSpider(ABC, Spider):

    def __init__(self, name, uri, queue, **kwargs):
        if not validate_url(uri):
            raise ValueError('Invalid URL')

        self.name = name
        parsed_url = urlparse(uri)
        self.start_urls = [uri]
        self.base_url = parsed_url.scheme + '://' + parsed_url.netloc
        self.allowed_domains = []
        self.allowed_domains.append(parsed_url.netloc)
        self.allowed_domains.append(uri.split('//')[-1])

        self.q = queue
        super().__init__(**kwargs)

    @abstractmethod
    def parse(self, response):
        """This method should implement how to parse the
        response data."""
        pass
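
BaseSpider leaves only parse() to concrete spiders: the constructor validates the URL, derives start_urls, base_url and allowed_domains, and stores the shared queue. A hypothetical subclass sketch (ExampleSpider and its selector are illustrative, not part of this commit):

.. code:: python

    from scraper_factory.core.base_spider import BaseSpider


    class ExampleSpider(BaseSpider):

        def __init__(self, uri, queue, **kwargs):
            # Register under a spider name and pass the shared queue upward
            super().__init__('example', uri, queue, **kwargs)

        def parse(self, response):
            # Push each extracted title onto the queue and yield it to Scrapy
            for title in response.css('h2::text').extract():
                item = {'title': title}
                self.q.put(item)
                yield item
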
23 changes: 23 additions & 0 deletions scraper_factory/core/scrape.py
@@ -0,0 +1,23 @@
from multiprocessing import Process, Queue
from scrapy import crawler
from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider
from scraper_factory.core import utils


def run_spider(url, queue):
    process = crawler.CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) '
                      'Gecko/20100101 Firefox/38.0'
    })
    process.crawl(AmazonWishlistSpider, url, queue)
    process.start()


def scrape(url):
    q = Queue()
    p = Process(target=run_spider, args=(url, q))
    p.start()
    p.join()
    return utils.queue_to_list(q)
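
The detour through multiprocessing appears deliberate: CrawlerProcess starts a Twisted reactor, which cannot be restarted inside the same process, so running each crawl in its own Process lets scrape() be called repeatedly while results come back through the queue. A sketch under that reading (the URLs are placeholders):

.. code:: python

    from scraper_factory import scrape

    # Each call spawns a fresh crawler process, so repeated calls are fine;
    # these wishlist URLs are illustrative only.
    for url in ('https://www.amazon.com/hz/wishlist/ls/AAAA1111',
                'https://www.amazon.com/hz/wishlist/ls/BBBB2222'):
        items = scrape(url)
        print(url, '->', len(items), 'items')
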
41 changes: 41 additions & 0 deletions scraper_factory/core/utils.py
@@ -0,0 +1,41 @@
from multiprocessing.queues import Queue
from urllib.parse import urlparse


def queue_to_list(q):
    """
    Transforms a multiprocessing queue into a list
    :param q: multiprocessing Queue object
    :return: list with the elements from the queue
    """
    if not isinstance(q, Queue):
        raise TypeError('Argument must be a multiprocessing Queue')

    arr = []
    while not q.empty():
        arr.append(q.get())
    return arr


def remove_query_string(url):
    """
    Removes the query string from a URL
    :param url: string with a URL
    :return: clean base URL
    """
    if not isinstance(url, str):
        raise TypeError('Argument must be a string')
    return url.split('?')[0]


def validate_url(url):
    """
    Checks that the given string contains a valid URL
    :param url: string with a URL
    :return: True if the URL is valid, False otherwise
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except (ValueError, TypeError, AttributeError):
        return False
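
A quick sketch of the three helpers in isolation; the URLs and queue contents are arbitrary examples:

.. code:: python

    import time
    from multiprocessing import Queue
    from scraper_factory.core import utils

    q = Queue()
    q.put({'id': 'A'})
    q.put({'id': 'B'})
    time.sleep(0.1)  # let the queue's feeder thread flush before draining
    print(utils.queue_to_list(q))  # [{'id': 'A'}, {'id': 'B'}]

    print(utils.remove_query_string('https://example.com/dp/X?ref=abc'))  # https://example.com/dp/X
    print(utils.validate_url('https://example.com'))  # True
    print(utils.validate_url('not-a-url'))  # False
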
Empty file.
42 changes: 42 additions & 0 deletions scraper_factory/spiders/amazonwishlist.py
@@ -0,0 +1,42 @@
from scrapy import Request
from scraper_factory.core.utils import remove_query_string
from scraper_factory.core.base_spider import BaseSpider


class AmazonWishlistSpider(BaseSpider):

    def __init__(self, uri, queue, **kwargs):
        super().__init__('amazonwishlist', uri, queue, **kwargs)

    def parse(self, response):
        page_items = response.css('.g-item-sortable')

        for item in page_items:
            id = item.css('li::attr(data-itemid)').extract_first()
            title = item.css('#itemName_' + id + '::text').extract_first()
            link = item.css('#itemName_' + id + '::attr(href)')\
                .extract_first()
            if link:
                link = self.base_url + link
            img = item.css('#itemImage_' + id).css('img::attr(src)')\
                .extract_first()

            obj = {
                'id': id,
                'title': title,
                'link': remove_query_string(link),
                'img': remove_query_string(img)
            }

            self.q.put(obj)
            yield obj

        # manage "infinite scrolldown"
        has_next = response.css('#sort-by-price-next-batch-lek'
                                '::attr(value)').extract_first()
        if has_next:
            lek_uri = response.css(
                '#sort-by-price-load-more-items-url-next-batch::attr(value)')\
                .extract_first()
            next_page = self.base_url + lek_uri
            yield Request(next_page)
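
One way to exercise parse() without hitting Amazon is to feed it a hand-built response; the markup below is a guess at the minimal structure the selectors expect and is illustrative only:

.. code:: python

    from multiprocessing import Queue
    from scrapy.http import HtmlResponse
    from scraper_factory.spiders.amazonwishlist import AmazonWishlistSpider

    # Fake wishlist entry mimicking the ids and classes parse() looks for
    BODY = b'''
    <ul>
      <li class="g-item-sortable" data-itemid="ITEM1">
        <a id="itemName_ITEM1" href="/dp/B000TEST/?psc=1">Sample item</a>
        <div id="itemImage_ITEM1"><img src="https://example.com/i.jpg?x=1"></div>
      </li>
    </ul>
    '''

    spider = AmazonWishlistSpider('https://www.amazon.com/hz/wishlist/ls/EXAMPLE',
                                  Queue())
    response = HtmlResponse(url=spider.start_urls[0], body=BODY, encoding='utf-8')

    for result in spider.parse(response):
        print(result)  # e.g. {'id': 'ITEM1', 'title': 'Sample item', ...}
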
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[aliases]
test = green -vv -r --omit=".eggs*,*/__init__.py"
52 changes: 52 additions & 0 deletions setup.py
@@ -0,0 +1,52 @@
import os
import codecs
from setuptools import setup, find_packages

here = os.path.abspath(os.path.dirname(__file__))


def read(*folders):
    with codecs.open(os.path.join(here, *folders), encoding='utf-8') as f:
        return f.read()


def get_requirements(file_name):
    requires_file = read('requirements', file_name)
    return requires_file.splitlines()


long_description = read('README.rst')

setup(
    name='scraper-factory',

    version='0.1.0',

    description='Scraping library to retrieve data from useful pages, such as Amazon wishlists',
    long_description=long_description,

    url='https://github.com/machinia/scraper-factory',

    author='Pablo Ahumada, Jorge Capona',
    author_email='pablo.ahumadadiaz@gmail.com, jcapona@gmail.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development',
        'Topic :: System',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
    keywords='scraping wishlist amazon',
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    install_requires=get_requirements('default.txt'),
    extras_require={},
    package_data={},
    data_files=[],
    entry_points={},
    test_suite='tests',
    setup_requires=get_requirements('tests.txt'),
)
44 changes: 0 additions & 44 deletions spiders/amazonwishlist.py

This file was deleted.
